rss-tools: vendor/golang.org/x/net/html/parse.go (master)

1

// Copyright 2010 The Go Authors. All rights reserved.

2

// Use of this source code is governed by a BSD-style

3

// license that can be found in the LICENSE file.

4

5

package html

6

7

import (

8

	"errors"

9

	"fmt"

10

	"io"

11

	"strings"

12

13

	a "golang.org/x/net/html/atom"

14

15

16

// A parser implements the HTML5 parsing algorithm:

17

// https://html.spec.whatwg.org/multipage/syntax.html#tree-construction

18

type parser struct {

19

	// tokenizer provides the tokens for the parser.

20

	tokenizer *Tokenizer

21

	// tok is the most recently read token.

22

	tok Token

23

	// Self-closing tags like <hr/> are treated as start tags, except that

24

	// hasSelfClosingToken is set while they are being processed.

25

	hasSelfClosingToken bool

26

	// doc is the document root element.

27

	doc *Node

28

	// The stack of open elements (section 12.2.4.2) and active formatting

29

	// elements (section 12.2.4.3).

30

	oe, afe nodeStack

31

	// Element pointers (section 12.2.4.4).

32

	head, form *Node

33

	// Other parsing state flags (section 12.2.4.5).

34

	scripting, framesetOK bool

35

	// The stack of template insertion modes

36

	templateStack insertionModeStack

37

	// im is the current insertion mode.

38

	im insertionMode

39

	// originalIM is the insertion mode to go back to after completing a text

40

	// or inTableText insertion mode.

41

	originalIM insertionMode

42

	// fosterParenting is whether new elements should be inserted according to

43

	// the foster parenting rules (section 12.2.6.1).

44

	fosterParenting bool

45

	// quirks is whether the parser is operating in "quirks mode."

46

	quirks bool

47

	// fragment is whether the parser is parsing an HTML fragment.

48

	fragment bool

49

	// context is the context element when parsing an HTML fragment

50

	// (section 12.4).

51

	context *Node

52

53

54

func (p *parser) top() *Node {

55

	if n := p.oe.top(); n != nil {

56

		return n

57

58

	return p.doc

59

60

61

// Stop tags for use in popUntil. These come from section 12.2.4.2.

62

var (

63

	defaultScopeStopTags = map[string][]a.Atom{

64

		"":     {a.Applet, a.Caption, a.Html, a.Table, a.Td, a.Th, a.Marquee, a.Object, a.Template},

65

		"math": {a.AnnotationXml, a.Mi, a.Mn, a.Mo, a.Ms, a.Mtext},

66

		"svg":  {a.Desc, a.ForeignObject, a.Title},

67

68

69

70

// scope identifies which set of stop tags bounds a search of the stack of
// open elements (see popUntil, indexOfElementInScope and
// clearStackToContext).
type scope int

// The scopes understood by the parser. The numeric values are only used for
// comparison and for the diagnostic text of internal-error panics.
const (
	defaultScope scope = iota
	listItemScope
	buttonScope
	tableScope
	tableRowScope
	tableBodyScope
	selectScope
)

81

82

// popUntil pops the stack of open elements at the highest element whose tag

83

// is in matchTags, provided there is no higher element in the scope's stop

84

// tags (as defined in section 12.2.4.2). It returns whether or not there was

85

// such an element. If there was not, popUntil leaves the stack unchanged.

86

//

87

// For example, the set of stop tags for table scope is: "html", "table". If

88

// the stack was:

89

// ["html", "body", "font", "table", "b", "i", "u"]

90

// then popUntil(tableScope, "font") would return false, but

91

// popUntil(tableScope, "i") would return true and the stack would become:

92

// ["html", "body", "font", "table", "b"]

93

//

94

// If an element's tag is in both the stop tags and matchTags, then the stack

95

// will be popped and the function returns true (provided, of course, there was

96

// no higher element in the stack that was also in the stop tags). For example,

97

// popUntil(tableScope, "table") returns true and leaves:

98

// ["html", "body", "font"]

99

func (p *parser) popUntil(s scope, matchTags ...a.Atom) bool {

100

	if i := p.indexOfElementInScope(s, matchTags...); i != -1 {

101

		p.oe = p.oe[:i]

102

		return true

103

104

	return false

105

106

107

// indexOfElementInScope returns the index in p.oe of the highest element whose

108

// tag is in matchTags that is in scope. If no matching element is in scope, it

109

// returns -1.

110

func (p *parser) indexOfElementInScope(s scope, matchTags ...a.Atom) int {

111

	for i := len(p.oe) - 1; i >= 0; i-- {

112

		tagAtom := p.oe[i].DataAtom

113

		if p.oe[i].Namespace == "" {

114

			for _, t := range matchTags {

115

				if t == tagAtom {

116

					return i

117

118

119

			switch s {

120

			case defaultScope:

121

				// No-op.

122

			case listItemScope:

123

				if tagAtom == a.Ol || tagAtom == a.Ul {

124

					return -1

125

126

			case buttonScope:

127

				if tagAtom == a.Button {

128

					return -1

129

130

			case tableScope:

131

				if tagAtom == a.Html || tagAtom == a.Table || tagAtom == a.Template {

132

					return -1

133

134

			case selectScope:

135

				if tagAtom != a.Optgroup && tagAtom != a.Option {

136

					return -1

137

138

			default:

139

				panic(fmt.Sprintf("html: internal error: indexOfElementInScope unknown scope: %d", s))

140

141

142

		switch s {

143

		case defaultScope, listItemScope, buttonScope:

144

			for _, t := range defaultScopeStopTags[p.oe[i].Namespace] {

145

				if t == tagAtom {

146

					return -1

147

148

149

150

151

	return -1

152

153

154

// elementInScope is like popUntil, except that it doesn't modify the stack of

155

// open elements.

156

func (p *parser) elementInScope(s scope, matchTags ...a.Atom) bool {

157

	return p.indexOfElementInScope(s, matchTags...) != -1

158

159

160

// clearStackToContext pops elements off the stack of open elements until a

161

// scope-defined element is found.

162

func (p *parser) clearStackToContext(s scope) {

163

	for i := len(p.oe) - 1; i >= 0; i-- {

164

		tagAtom := p.oe[i].DataAtom

165

		switch s {

166

		case tableScope:

167

			if tagAtom == a.Html || tagAtom == a.Table || tagAtom == a.Template {

168

				p.oe = p.oe[:i+1]

169

				return

170

171

		case tableRowScope:

172

			if tagAtom == a.Html || tagAtom == a.Tr || tagAtom == a.Template {

173

				p.oe = p.oe[:i+1]

174

				return

175

176

		case tableBodyScope:

177

			if tagAtom == a.Html || tagAtom == a.Tbody || tagAtom == a.Tfoot || tagAtom == a.Thead || tagAtom == a.Template {

178

				p.oe = p.oe[:i+1]

179

				return

180

181

		default:

182

			panic(fmt.Sprintf("html: internal error: clearStackToContext unknown scope: %d", s))

183

184

185

186

187

// parseGenericRawTextElement implements the generic raw text element parsing

188

// algorithm defined in 12.2.6.2.

189

// https://html.spec.whatwg.org/multipage/parsing.html#parsing-elements-that-contain-only-text

190

// TODO: Since both RAWTEXT and RCDATA states are treated as tokenizer's part

191

// officially, need to make tokenizer consider both states.

192

func (p *parser) parseGenericRawTextElement() {

193

	p.addElement()

194

	p.originalIM = p.im

195

	p.im = textIM

196

197

198

// generateImpliedEndTags pops nodes off the stack of open elements as long as

199

// the top node has a tag name of dd, dt, li, optgroup, option, p, rb, rp, rt or rtc.

200

// If exceptions are specified, nodes with that name will not be popped off.

201

func (p *parser) generateImpliedEndTags(exceptions ...string) {

202

	var i int

203

loop:

204

	for i = len(p.oe) - 1; i >= 0; i-- {

205

		n := p.oe[i]

206

		if n.Type != ElementNode {

207

			break

208

209

		switch n.DataAtom {

210

		case a.Dd, a.Dt, a.Li, a.Optgroup, a.Option, a.P, a.Rb, a.Rp, a.Rt, a.Rtc:

211

			for _, except := range exceptions {

212

				if n.Data == except {

213

					break loop

214

215

216

			continue

217

218

		break

219

220

221

	p.oe = p.oe[:i+1]

222

223

224

// addChild adds a child node n to the top element, and pushes n onto the stack

225

// of open elements if it is an element node.

226

func (p *parser) addChild(n *Node) {

227

	if p.shouldFosterParent() {

228

		p.fosterParent(n)

229

	} else {

230

		p.top().AppendChild(n)

231

232

233

	if n.Type == ElementNode {

234

		p.insertOpenElement(n)

235

236

237

238

func (p *parser) insertOpenElement(n *Node) {

239

	p.oe = append(p.oe, n)

240

	if len(p.oe) > 512 {

241

		panic("html: open stack of elements exceeds 512 nodes")

242

243

244

245

// shouldFosterParent returns whether the next node to be added should be

246

// foster parented.

247

func (p *parser) shouldFosterParent() bool {

248

	if p.fosterParenting {

249

		switch p.top().DataAtom {

250

		case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:

251

			return true

252

253

254

	return false

255

256

257

// fosterParent adds a child node according to the foster parenting rules.

258

// Section 12.2.6.1, "foster parenting".

259

func (p *parser) fosterParent(n *Node) {

260

	var table, parent, prev, template *Node

261

	var i int

262

	for i = len(p.oe) - 1; i >= 0; i-- {

263

		if p.oe[i].DataAtom == a.Table {

264

			table = p.oe[i]

265

			break

266

267

268

269

	var j int

270

	for j = len(p.oe) - 1; j >= 0; j-- {

271

		if p.oe[j].DataAtom == a.Template {

272

			template = p.oe[j]

273

			break

274

275

276

277

	if template != nil && (table == nil || j > i) {

278

		template.AppendChild(n)

279

		return

280

281

282

	if table == nil {

283

		// The foster parent is the html element.

284

		parent = p.oe[0]

285

	} else {

286

		parent = table.Parent

287

288

	if parent == nil {

289

		parent = p.oe[i-1]

290

291

292

	if table != nil {

293

		prev = table.PrevSibling

294

	} else {

295

		prev = parent.LastChild

296

297

	if prev != nil && prev.Type == TextNode && n.Type == TextNode {

298

		prev.Data += n.Data

299

		return

300

301

302

	parent.InsertBefore(n, table)

303

304

305

// addText adds text to the preceding node if it is a text node, or else it

306

// calls addChild with a new text node.

307

func (p *parser) addText(text string) {

308

	if text == "" {

309

		return

310

311

312

	if p.shouldFosterParent() {

313

		p.fosterParent(&Node{

314

			Type: TextNode,

315

			Data: text,

316

})

317

		return

318

319

320

	t := p.top()

321

	if n := t.LastChild; n != nil && n.Type == TextNode {

322

		n.Data += text

323

		return

324

325

	p.addChild(&Node{

326

		Type: TextNode,

327

		Data: text,

328

})

329

330

331

// addElement adds a child element based on the current token.

332

func (p *parser) addElement() {

333

	p.addChild(&Node{

334

		Type:     ElementNode,

335

		DataAtom: p.tok.DataAtom,

336

		Data:     p.tok.Data,

337

		Attr:     p.tok.Attr,

338

})

339

340

341

// Section 12.2.4.3.

342

func (p *parser) addFormattingElement() {

343

	tagAtom, attr := p.tok.DataAtom, p.tok.Attr

344

	p.addElement()

345

346

	// Implement the Noah's Ark clause, but with three per family instead of two.

347

	identicalElements := 0

348

findIdenticalElements:

349

	for i := len(p.afe) - 1; i >= 0; i-- {

350

		n := p.afe[i]

351

		if n.Type == scopeMarkerNode {

352

			break

353

354

		if n.Type != ElementNode {

355

			continue

356

357

		if n.Namespace != "" {

358

			continue

359

360

		if n.DataAtom != tagAtom {

361

			continue

362

363

		if len(n.Attr) != len(attr) {

364

			continue

365

366

	compareAttributes:

367

		for _, t0 := range n.Attr {

368

			for _, t1 := range attr {

369

				if t0.Key == t1.Key && t0.Namespace == t1.Namespace && t0.Val == t1.Val {

370

					// Found a match for this attribute, continue with the next attribute.

371

					continue compareAttributes

372

373

374

			// If we get here, there is no attribute that matches a.

375

			// Therefore the element is not identical to the new one.

376

			continue findIdenticalElements

377

378

379

		identicalElements++

380

		if identicalElements >= 3 {

381

			p.afe.remove(n)

382

383

384

385

	p.afe = append(p.afe, p.top())

386

387

388

// Section 12.2.4.3.

389

func (p *parser) clearActiveFormattingElements() {

390

	for {

391

		if n := p.afe.pop(); len(p.afe) == 0 || n.Type == scopeMarkerNode {

392

			return

393

394

395

396

397

// Section 12.2.4.3.

398

func (p *parser) reconstructActiveFormattingElements() {

399

	n := p.afe.top()

400

	if n == nil {

401

		return

402

403

	if n.Type == scopeMarkerNode || p.oe.index(n) != -1 {

404

		return

405

406

	i := len(p.afe) - 1

407

	for n.Type != scopeMarkerNode && p.oe.index(n) == -1 {

408

		if i == 0 {

409

			i = -1

410

			break

411

412

i--

413

		n = p.afe[i]

414

415

	for {

416

i++

417

		clone := p.afe[i].clone()

418

		p.addChild(clone)

419

		p.afe[i] = clone

420

		if i == len(p.afe)-1 {

421

			break

422

423

424

425

426

// Section 12.2.5.

427

func (p *parser) acknowledgeSelfClosingTag() {

428

	p.hasSelfClosingToken = false

429

430

431

// An insertion mode (section 12.2.4.1) is the state transition function from

432

// a particular state in the HTML5 parser's state machine. It updates the

433

// parser's fields depending on parser.tok (where ErrorToken means EOF).

434

// It returns whether the token was consumed.

435

type insertionMode func(*parser) bool

436

437

// setOriginalIM sets the insertion mode to return to after completing a text or

438

// inTableText insertion mode.

439

// Section 12.2.4.1, "using the rules for".

440

func (p *parser) setOriginalIM() {

441

	if p.originalIM != nil {

442

		panic("html: bad parser state: originalIM was set twice")

443

444

	p.originalIM = p.im

445

446

447

// Section 12.2.4.1, "reset the insertion mode".

448

func (p *parser) resetInsertionMode() {

449

	for i := len(p.oe) - 1; i >= 0; i-- {

450

		n := p.oe[i]

451

		last := i == 0

452

		if last && p.context != nil {

453

			n = p.context

454

455

456

		switch n.DataAtom {

457

		case a.Select:

458

			if !last {

459

				for ancestor, first := n, p.oe[0]; ancestor != first; {

460

					ancestor = p.oe[p.oe.index(ancestor)-1]

461

					switch ancestor.DataAtom {

462

					case a.Template:

463

						p.im = inSelectIM

464

						return

465

					case a.Table:

466

						p.im = inSelectInTableIM

467

						return

468

469

470

471

			p.im = inSelectIM

472

		case a.Td, a.Th:

473

			// TODO: remove this divergence from the HTML5 spec.

474

//

475

			// See https://bugs.chromium.org/p/chromium/issues/detail?id=829668

476

			p.im = inCellIM

477

		case a.Tr:

478

			p.im = inRowIM

479

		case a.Tbody, a.Thead, a.Tfoot:

480

			p.im = inTableBodyIM

481

		case a.Caption:

482

			p.im = inCaptionIM

483

		case a.Colgroup:

484

			p.im = inColumnGroupIM

485

		case a.Table:

486

			p.im = inTableIM

487

		case a.Template:

488

			// TODO: remove this divergence from the HTML5 spec.

489

			if n.Namespace != "" {

490

				continue

491

492

			p.im = p.templateStack.top()

493

		case a.Head:

494

			// TODO: remove this divergence from the HTML5 spec.

495

//

496

			// See https://bugs.chromium.org/p/chromium/issues/detail?id=829668

497

			p.im = inHeadIM

498

		case a.Body:

499

			p.im = inBodyIM

500

		case a.Frameset:

501

			p.im = inFramesetIM

502

		case a.Html:

503

			if p.head == nil {

504

				p.im = beforeHeadIM

505

			} else {

506

				p.im = afterHeadIM

507

508

		default:

509

			if last {

510

				p.im = inBodyIM

511

				return

512

513

			continue

514

515

		return

516

517

518

519

// whitespace is the cutset handed to strings.TrimLeft by the insertion modes
// to strip leading space, tab, carriage return, line feed and form feed
// characters from text tokens.
const whitespace = " \t\r\n\f"

520

521

// Section 12.2.6.4.1.

522

func initialIM(p *parser) bool {

523

	switch p.tok.Type {

524

	case TextToken:

525

		p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace)

526

		if len(p.tok.Data) == 0 {

527

			// It was all whitespace, so ignore it.

528

			return true

529

530

	case CommentToken:

531

		p.doc.AppendChild(&Node{

532

			Type: CommentNode,

533

			Data: p.tok.Data,

534

})

535

		return true

536

	case DoctypeToken:

537

		n, quirks := parseDoctype(p.tok.Data)

538

		p.doc.AppendChild(n)

539

		p.quirks = quirks

540

		p.im = beforeHTMLIM

541

		return true

542

543

	p.quirks = true

544

	p.im = beforeHTMLIM

545

	return false

546

547

548

// Section 12.2.6.4.2.

549

func beforeHTMLIM(p *parser) bool {

550

	switch p.tok.Type {

551

	case DoctypeToken:

552

		// Ignore the token.

553

		return true

554

	case TextToken:

555

		p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace)

556

		if len(p.tok.Data) == 0 {

557

			// It was all whitespace, so ignore it.

558

			return true

559

560

	case StartTagToken:

561

		if p.tok.DataAtom == a.Html {

562

			p.addElement()

563

			p.im = beforeHeadIM

564

			return true

565

566

	case EndTagToken:

567

		switch p.tok.DataAtom {

568

		case a.Head, a.Body, a.Html, a.Br:

569

			p.parseImpliedToken(StartTagToken, a.Html, a.Html.String())

570

			return false

571

		default:

572

			// Ignore the token.

573

			return true

574

575

	case CommentToken:

576

		p.doc.AppendChild(&Node{

577

			Type: CommentNode,

578

			Data: p.tok.Data,

579

})

580

		return true

581

582

	p.parseImpliedToken(StartTagToken, a.Html, a.Html.String())

583

	return false

584

585

586

// Section 12.2.6.4.3.

587

func beforeHeadIM(p *parser) bool {

588

	switch p.tok.Type {

589

	case TextToken:

590

		p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace)

591

		if len(p.tok.Data) == 0 {

592

			// It was all whitespace, so ignore it.

593

			return true

594

595

	case StartTagToken:

596

		switch p.tok.DataAtom {

597

		case a.Head:

598

			p.addElement()

599

			p.head = p.top()

600

			p.im = inHeadIM

601

			return true

602

		case a.Html:

603

			return inBodyIM(p)

604

605

	case EndTagToken:

606

		switch p.tok.DataAtom {

607

		case a.Head, a.Body, a.Html, a.Br:

608

			p.parseImpliedToken(StartTagToken, a.Head, a.Head.String())

609

			return false

610

		default:

611

			// Ignore the token.

612

			return true

613

614

	case CommentToken:

615

		p.addChild(&Node{

616

			Type: CommentNode,

617

			Data: p.tok.Data,

618

})

619

		return true

620

	case DoctypeToken:

621

		// Ignore the token.

622

		return true

623

624

625

	p.parseImpliedToken(StartTagToken, a.Head, a.Head.String())

626

	return false

627

628

629

// Section 12.2.6.4.4.

630

func inHeadIM(p *parser) bool {

631

	switch p.tok.Type {

632

	case TextToken:

633

		s := strings.TrimLeft(p.tok.Data, whitespace)

634

		if len(s) < len(p.tok.Data) {

635

			// Add the initial whitespace to the current node.

636

			p.addText(p.tok.Data[:len(p.tok.Data)-len(s)])

637

			if s == "" {

638

				return true

639

640

			p.tok.Data = s

641

642

	case StartTagToken:

643

		switch p.tok.DataAtom {

644

		case a.Html:

645

			return inBodyIM(p)

646

		case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta:

647

			p.addElement()

648

			p.oe.pop()

649

			p.acknowledgeSelfClosingTag()

650

			return true

651

		case a.Noscript:

652

			if p.scripting {

653

				p.parseGenericRawTextElement()

654

				return true

655

656

			p.addElement()

657

			p.im = inHeadNoscriptIM

658

			// Don't let the tokenizer go into raw text mode when scripting is disabled.

659

			p.tokenizer.NextIsNotRawText()

660

			return true

661

		case a.Script, a.Title:

662

			p.addElement()

663

			p.setOriginalIM()

664

			p.im = textIM

665

			return true

666

		case a.Noframes, a.Style:

667

			p.parseGenericRawTextElement()

668

			return true

669

		case a.Head:

670

			// Ignore the token.

671

			return true

672

		case a.Template:

673

			// TODO: remove this divergence from the HTML5 spec.

674

//

675

			// We don't handle all of the corner cases when mixing foreign

676

			// content (i.e. <math> or <svg>) with <template>. Without this

677

			// early return, we can get into an infinite loop, possibly because

678

			// of the "TODO... further divergence" a little below.

679

//

680

			// As a workaround, if we are mixing foreign content and templates,

681

			// just ignore the rest of the HTML. Foreign content is rare and a

682

			// relatively old HTML feature. Templates are also rare and a

683

			// relatively new HTML feature. Their combination is very rare.

684

			for _, e := range p.oe {

685

				if e.Namespace != "" {

686

					p.im = ignoreTheRemainingTokens

687

					return true

688

689

690

691

			p.addElement()

692

			p.afe = append(p.afe, &scopeMarker)

693

			p.framesetOK = false

694

			p.im = inTemplateIM

695

			p.templateStack = append(p.templateStack, inTemplateIM)

696

			return true

697

698

	case EndTagToken:

699

		switch p.tok.DataAtom {

700

		case a.Head:

701

			p.oe.pop()

702

			p.im = afterHeadIM

703

			return true

704

		case a.Body, a.Html, a.Br:

705

			p.parseImpliedToken(EndTagToken, a.Head, a.Head.String())

706

			return false

707

		case a.Template:

708

			if !p.oe.contains(a.Template) {

709

				return true

710

711

			// TODO: remove this further divergence from the HTML5 spec.

712

//

713

			// See https://bugs.chromium.org/p/chromium/issues/detail?id=829668

714

			p.generateImpliedEndTags()

715

			for i := len(p.oe) - 1; i >= 0; i-- {

716

				if n := p.oe[i]; n.Namespace == "" && n.DataAtom == a.Template {

717

					p.oe = p.oe[:i]

718

					break

719

720

721

			p.clearActiveFormattingElements()

722

			p.templateStack.pop()

723

			p.resetInsertionMode()

724

			return true

725

		default:

726

			// Ignore the token.

727

			return true

728

729

	case CommentToken:

730

		p.addChild(&Node{

731

			Type: CommentNode,

732

			Data: p.tok.Data,

733

})

734

		return true

735

	case DoctypeToken:

736

		// Ignore the token.

737

		return true

738

739

740

	p.parseImpliedToken(EndTagToken, a.Head, a.Head.String())

741

	return false

742

743

744

// Section 12.2.6.4.5.

745

func inHeadNoscriptIM(p *parser) bool {

746

	switch p.tok.Type {

747

	case DoctypeToken:

748

		// Ignore the token.

749

		return true

750

	case StartTagToken:

751

		switch p.tok.DataAtom {

752

		case a.Html:

753

			return inBodyIM(p)

754

		case a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Style:

755

			return inHeadIM(p)

756

		case a.Head:

757

			// Ignore the token.

758

			return true

759

		case a.Noscript:

760

			// Don't let the tokenizer go into raw text mode even when a <noscript>

761

			// tag is in "in head noscript" insertion mode.

762

			p.tokenizer.NextIsNotRawText()

763

			// Ignore the token.

764

			return true

765

766

	case EndTagToken:

767

		switch p.tok.DataAtom {

768

		case a.Noscript, a.Br:

769

		default:

770

			// Ignore the token.

771

			return true

772

773

	case TextToken:

774

		s := strings.TrimLeft(p.tok.Data, whitespace)

775

		if len(s) == 0 {

776

			// It was all whitespace.

777

			return inHeadIM(p)

778

779

	case CommentToken:

780

		return inHeadIM(p)

781

782

	p.oe.pop()

783

	if p.top().DataAtom != a.Head {

784

		panic("html: the new current node will be a head element.")

785

786

	p.im = inHeadIM

787

	if p.tok.DataAtom == a.Noscript {

788

		return true

789

790

	return false

791

792

793

// Section 12.2.6.4.6.

794

func afterHeadIM(p *parser) bool {

795

	switch p.tok.Type {

796

	case TextToken:

797

		s := strings.TrimLeft(p.tok.Data, whitespace)

798

		if len(s) < len(p.tok.Data) {

799

			// Add the initial whitespace to the current node.

800

			p.addText(p.tok.Data[:len(p.tok.Data)-len(s)])

801

			if s == "" {

802

				return true

803

804

			p.tok.Data = s

805

806

	case StartTagToken:

807

		switch p.tok.DataAtom {

808

		case a.Html:

809

			return inBodyIM(p)

810

		case a.Body:

811

			p.addElement()

812

			p.framesetOK = false

813

			p.im = inBodyIM

814

			return true

815

		case a.Frameset:

816

			p.addElement()

817

			p.im = inFramesetIM

818

			return true

819

		case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title:

820

			p.insertOpenElement(p.head)

821

			defer p.oe.remove(p.head)

822

			return inHeadIM(p)

823

		case a.Head:

824

			// Ignore the token.

825

			return true

826

827

	case EndTagToken:

828

		switch p.tok.DataAtom {

829

		case a.Body, a.Html, a.Br:

830

			// Drop down to creating an implied <body> tag.

831

		case a.Template:

832

			return inHeadIM(p)

833

		default:

834

			// Ignore the token.

835

			return true

836

837

	case CommentToken:

838

		p.addChild(&Node{

839

			Type: CommentNode,

840

			Data: p.tok.Data,

841

})

842

		return true

843

	case DoctypeToken:

844

		// Ignore the token.

845

		return true

846

847

848

	p.parseImpliedToken(StartTagToken, a.Body, a.Body.String())

849

	p.framesetOK = true

850

	if p.tok.Type == ErrorToken {

851

		// Stop parsing.

852

		return true

853

854

	return false

855

856

857

// copyAttributes copies attributes of src not found on dst to dst.

858

func copyAttributes(dst *Node, src Token) {

859

	if len(src.Attr) == 0 {

860

		return

861

862

	attr := map[string]string{}

863

	for _, t := range dst.Attr {

864

		attr[t.Key] = t.Val

865

866

	for _, t := range src.Attr {

867

		if _, ok := attr[t.Key]; !ok {

868

			dst.Attr = append(dst.Attr, t)

869

			attr[t.Key] = t.Val

870

871

872

873

874

// Section 12.2.6.4.7.

875

func inBodyIM(p *parser) bool {

876

	switch p.tok.Type {

877

	case TextToken:

878

		d := p.tok.Data

879

		switch n := p.oe.top(); n.DataAtom {

880

		case a.Pre, a.Listing:

881

			if n.FirstChild == nil {

882

				// Ignore a newline at the start of a <pre> block.

883

				if d != "" && d[0] == '\r' {

884

					d = d[1:]

885

886

				if d != "" && d[0] == '\n' {

887

					d = d[1:]

888

889

890

891

		d = strings.Replace(d, "\x00", "", -1)

892

		if d == "" {

893

			return true

894

895

		p.reconstructActiveFormattingElements()

896

		p.addText(d)

897

		if p.framesetOK && strings.TrimLeft(d, whitespace) != "" {

898

			// There were non-whitespace characters inserted.

899

			p.framesetOK = false

900

901

	case StartTagToken:

902

		switch p.tok.DataAtom {

903

		case a.Html:

904

			if p.oe.contains(a.Template) {

905

				return true

906

907

			copyAttributes(p.oe[0], p.tok)

908

		case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title:

909

			return inHeadIM(p)

910

		case a.Body:

911

			if p.oe.contains(a.Template) {

912

				return true

913

914

			if len(p.oe) >= 2 {

915

				body := p.oe[1]

916

				if body.Type == ElementNode && body.DataAtom == a.Body {

917

					p.framesetOK = false

918

					copyAttributes(body, p.tok)

919

920

921

		case a.Frameset:

922

			if !p.framesetOK || len(p.oe) < 2 || p.oe[1].DataAtom != a.Body {

923

				// Ignore the token.

924

				return true

925

926

			body := p.oe[1]

927

			if body.Parent != nil {

928

				body.Parent.RemoveChild(body)

929

930

			p.oe = p.oe[:1]

931

			p.addElement()

932

			p.im = inFramesetIM

933

			return true

934

		case a.Address, a.Article, a.Aside, a.Blockquote, a.Center, a.Details, a.Dialog, a.Dir, a.Div, a.Dl, a.Fieldset, a.Figcaption, a.Figure, a.Footer, a.Header, a.Hgroup, a.Main, a.Menu, a.Nav, a.Ol, a.P, a.Search, a.Section, a.Summary, a.Ul:

935

			p.popUntil(buttonScope, a.P)

936

			p.addElement()

937

		case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:

938

			p.popUntil(buttonScope, a.P)

939

			switch n := p.top(); n.DataAtom {

940

			case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:

941

				p.oe.pop()

942

943

			p.addElement()

944

		case a.Pre, a.Listing:

945

			p.popUntil(buttonScope, a.P)

946

			p.addElement()

947

			// The newline, if any, will be dealt with by the TextToken case.

948

			p.framesetOK = false

949

		case a.Form:

950

			if p.form != nil && !p.oe.contains(a.Template) {

951

				// Ignore the token

952

				return true

953

954

			p.popUntil(buttonScope, a.P)

955

			p.addElement()

956

			if !p.oe.contains(a.Template) {

957

				p.form = p.top()

958

959

		case a.Li:

960

			p.framesetOK = false

961

			for i := len(p.oe) - 1; i >= 0; i-- {

962

				node := p.oe[i]

963

				switch node.DataAtom {

964

				case a.Li:

965

					p.oe = p.oe[:i]

966

				case a.Address, a.Div, a.P:

967

					continue

968

				default:

969

					if !isSpecialElement(node) {

970

						continue

971

972

973

				break

974

975

			p.popUntil(buttonScope, a.P)

976

			p.addElement()

977

		case a.Dd, a.Dt:

978

			p.framesetOK = false

979

			for i := len(p.oe) - 1; i >= 0; i-- {

980

				node := p.oe[i]

981

				switch node.DataAtom {

982

				case a.Dd, a.Dt:

983

					p.oe = p.oe[:i]

984

				case a.Address, a.Div, a.P:

985

					continue

986

				default:

987

					if !isSpecialElement(node) {

988

						continue

989

990

991

				break

992

993

			p.popUntil(buttonScope, a.P)

994

			p.addElement()

995

		case a.Plaintext:

996

			p.popUntil(buttonScope, a.P)

997

			p.addElement()

998

		case a.Button:

999

			p.popUntil(defaultScope, a.Button)

1000

			p.reconstructActiveFormattingElements()

1001

			p.addElement()

1002

			p.framesetOK = false

1003

		case a.A:

1004

			for i := len(p.afe) - 1; i >= 0 && p.afe[i].Type != scopeMarkerNode; i-- {

1005

				if n := p.afe[i]; n.Type == ElementNode && n.DataAtom == a.A {

1006

					p.inBodyEndTagFormatting(a.A, "a")

1007

					p.oe.remove(n)

1008

					p.afe.remove(n)

1009

					break

1010

1011

1012

			p.reconstructActiveFormattingElements()

1013

			p.addFormattingElement()

1014

		case a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U:

1015

			p.reconstructActiveFormattingElements()

1016

			p.addFormattingElement()

1017

		case a.Nobr:

1018

			p.reconstructActiveFormattingElements()

1019

			if p.elementInScope(defaultScope, a.Nobr) {

1020

				p.inBodyEndTagFormatting(a.Nobr, "nobr")

1021

				p.reconstructActiveFormattingElements()

1022

1023

			p.addFormattingElement()

1024

		case a.Applet, a.Marquee, a.Object:

1025

			p.reconstructActiveFormattingElements()

1026

			p.addElement()

1027

			p.afe = append(p.afe, &scopeMarker)

1028

			p.framesetOK = false

1029

		case a.Table:

1030

			if !p.quirks {

1031

				p.popUntil(buttonScope, a.P)

1032

1033

			p.addElement()

1034

			p.framesetOK = false

1035

			p.im = inTableIM

1036

			return true

1037

		case a.Area, a.Br, a.Embed, a.Img, a.Input, a.Keygen, a.Wbr:

1038

			p.reconstructActiveFormattingElements()

1039

			p.addElement()

1040

			p.oe.pop()

1041

			p.acknowledgeSelfClosingTag()

1042

			if p.tok.DataAtom == a.Input {

1043

				for _, t := range p.tok.Attr {

1044

					if t.Key == "type" {

1045

						if strings.EqualFold(t.Val, "hidden") {

1046

							// Skip setting framesetOK = false

1047

							return true

1048

1049

1050

1051

1052

			p.framesetOK = false

1053

		case a.Param, a.Source, a.Track:

1054

			p.addElement()

1055

			p.oe.pop()

1056

			p.acknowledgeSelfClosingTag()

1057

		case a.Hr:

1058

			p.popUntil(buttonScope, a.P)

1059

			p.addElement()

1060

			p.oe.pop()

1061

			p.acknowledgeSelfClosingTag()

1062

			p.framesetOK = false

1063

		case a.Image:

1064

			p.tok.DataAtom = a.Img

1065

			p.tok.Data = a.Img.String()

1066

			return false

1067

		case a.Textarea:

1068

			p.addElement()

1069

			p.setOriginalIM()

1070

			p.framesetOK = false

1071

			p.im = textIM

1072

		case a.Xmp:

1073

			p.popUntil(buttonScope, a.P)

1074

			p.reconstructActiveFormattingElements()

1075

			p.framesetOK = false

1076

			p.parseGenericRawTextElement()

1077

		case a.Iframe:

1078

			p.framesetOK = false

1079

			p.parseGenericRawTextElement()

1080

		case a.Noembed:

1081

			p.parseGenericRawTextElement()

1082

		case a.Noscript:

1083

			if p.scripting {

1084

				p.parseGenericRawTextElement()

1085

				return true

1086

1087

			p.reconstructActiveFormattingElements()

1088

			p.addElement()

1089

			// Don't let the tokenizer go into raw text mode when scripting is disabled.

1090

			p.tokenizer.NextIsNotRawText()

1091

		case a.Select:

1092

			p.reconstructActiveFormattingElements()

1093

			p.addElement()

1094

			p.framesetOK = false

1095

			p.im = inSelectIM

1096

			return true

1097

		case a.Optgroup, a.Option:

1098

			if p.top().DataAtom == a.Option {

1099

				p.oe.pop()

1100

1101

			p.reconstructActiveFormattingElements()

1102

			p.addElement()

1103

		case a.Rb, a.Rtc:

1104

			if p.elementInScope(defaultScope, a.Ruby) {

1105

				p.generateImpliedEndTags()

1106

1107

			p.addElement()

1108

		case a.Rp, a.Rt:

1109

			if p.elementInScope(defaultScope, a.Ruby) {

1110

				p.generateImpliedEndTags("rtc")

1111

1112

			p.addElement()

1113

		case a.Math, a.Svg:

1114

			p.reconstructActiveFormattingElements()

1115

			if p.tok.DataAtom == a.Math {

1116

				adjustAttributeNames(p.tok.Attr, mathMLAttributeAdjustments)

1117

			} else {

1118

				adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments)

1119

1120

			adjustForeignAttributes(p.tok.Attr)

1121

			p.addElement()

1122

			p.top().Namespace = p.tok.Data

1123

			if p.hasSelfClosingToken {

1124

				p.oe.pop()

1125

				p.acknowledgeSelfClosingTag()

1126

1127

			return true

1128

		case a.Caption, a.Col, a.Colgroup, a.Frame, a.Head, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr:

1129

			// Ignore the token.

1130

		default:

1131

			p.reconstructActiveFormattingElements()

1132

			p.addElement()

1133

1134

	case EndTagToken:

1135

		switch p.tok.DataAtom {

1136

		case a.Body:

1137

			if p.elementInScope(defaultScope, a.Body) {

1138

				p.im = afterBodyIM

1139

1140

		case a.Html:

1141

			if p.elementInScope(defaultScope, a.Body) {

1142

				p.parseImpliedToken(EndTagToken, a.Body, a.Body.String())

1143

				return false

1144

1145

			return true

1146

		case a.Address, a.Article, a.Aside, a.Blockquote, a.Button, a.Center, a.Details, a.Dialog, a.Dir, a.Div, a.Dl, a.Fieldset, a.Figcaption, a.Figure, a.Footer, a.Header, a.Hgroup, a.Listing, a.Main, a.Menu, a.Nav, a.Ol, a.Pre, a.Search, a.Section, a.Summary, a.Ul:

1147

			p.popUntil(defaultScope, p.tok.DataAtom)

1148

		case a.Form:

1149

			if p.oe.contains(a.Template) {

1150

				i := p.indexOfElementInScope(defaultScope, a.Form)

1151

				if i == -1 {

1152

					// Ignore the token.

1153

					return true

1154

1155

				p.generateImpliedEndTags()

1156

				if p.oe[i].DataAtom != a.Form {

1157

					// Ignore the token.

1158

					return true

1159

1160

				p.popUntil(defaultScope, a.Form)

1161

			} else {

1162

				node := p.form

1163

				p.form = nil

1164

				i := p.indexOfElementInScope(defaultScope, a.Form)

1165

				if node == nil || i == -1 || p.oe[i] != node {

1166

					// Ignore the token.

1167

					return true

1168

1169

				p.generateImpliedEndTags()

1170

				p.oe.remove(node)

1171

1172

		case a.P:

1173

			if !p.elementInScope(buttonScope, a.P) {

1174

				p.parseImpliedToken(StartTagToken, a.P, a.P.String())

1175

1176

			p.popUntil(buttonScope, a.P)

1177

		case a.Li:

1178

			p.popUntil(listItemScope, a.Li)

1179

		case a.Dd, a.Dt:

1180

			p.popUntil(defaultScope, p.tok.DataAtom)

1181

		case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:

1182

			p.popUntil(defaultScope, a.H1, a.H2, a.H3, a.H4, a.H5, a.H6)

1183

		case a.A, a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.Nobr, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U:

1184

			p.inBodyEndTagFormatting(p.tok.DataAtom, p.tok.Data)

1185

		case a.Applet, a.Marquee, a.Object:

1186

			if p.popUntil(defaultScope, p.tok.DataAtom) {

1187

				p.clearActiveFormattingElements()

1188

1189

		case a.Br:

1190

			p.tok.Type = StartTagToken

1191

			return false

1192

		case a.Template:

1193

			return inHeadIM(p)

1194

		default:

1195

			p.inBodyEndTagOther(p.tok.DataAtom, p.tok.Data)

1196

1197

	case CommentToken:

1198

		p.addChild(&Node{

1199

			Type: CommentNode,

1200

			Data: p.tok.Data,

1201

})

1202

	case ErrorToken:

1203

		// TODO: remove this divergence from the HTML5 spec.

1204

		if len(p.templateStack) > 0 {

1205

			p.im = inTemplateIM

1206

			return false

1207

1208

		for _, e := range p.oe {

1209

			switch e.DataAtom {

1210

			case a.Dd, a.Dt, a.Li, a.Optgroup, a.Option, a.P, a.Rb, a.Rp, a.Rt, a.Rtc, a.Tbody, a.Td, a.Tfoot, a.Th,

1211

				a.Thead, a.Tr, a.Body, a.Html:

1212

			default:

1213

				return true

1214

1215

1216

1217

1218

	return true

1219

1220

1221

func (p *parser) inBodyEndTagFormatting(tagAtom a.Atom, tagName string) {

1222

	// This is the "adoption agency" algorithm, described at

1223

	// https://html.spec.whatwg.org/multipage/syntax.html#adoptionAgency

1224

1225

	// TODO: this is a fairly literal line-by-line translation of that algorithm.

1226

	// Once the code successfully parses the comprehensive test suite, we should

1227

	// refactor this code to be more idiomatic.

1228

1229

	// Steps 1-2

1230

	if current := p.oe.top(); current.Data == tagName && p.afe.index(current) == -1 {

1231

		p.oe.pop()

1232

		return

1233

1234

1235

	// Steps 3-5. The outer loop.

1236

	for i := 0; i < 8; i++ {

1237

		// Step 6. Find the formatting element.

1238

		var formattingElement *Node

1239

		for j := len(p.afe) - 1; j >= 0; j-- {

1240

			if p.afe[j].Type == scopeMarkerNode {

1241

				break

1242

1243

			if p.afe[j].DataAtom == tagAtom {

1244

				formattingElement = p.afe[j]

1245

				break

1246

1247

1248

		if formattingElement == nil {

1249

			p.inBodyEndTagOther(tagAtom, tagName)

1250

			return

1251

1252

1253

		// Step 7. Ignore the tag if formatting element is not in the stack of open elements.

1254

		feIndex := p.oe.index(formattingElement)

1255

		if feIndex == -1 {

1256

			p.afe.remove(formattingElement)

1257

			return

1258

1259

		// Step 8. Ignore the tag if formatting element is not in the scope.

1260

		if !p.elementInScope(defaultScope, tagAtom) {

1261

			// Ignore the tag.

1262

			return

1263

1264

1265

		// Step 9. This step is omitted because it's just a parse error but no need to return.

1266

1267

		// Steps 10-11. Find the furthest block.

1268

		var furthestBlock *Node

1269

		for _, e := range p.oe[feIndex:] {

1270

			if isSpecialElement(e) {

1271

				furthestBlock = e

1272

				break

1273

1274

1275

		if furthestBlock == nil {

1276

			e := p.oe.pop()

1277

			for e != formattingElement {

1278

				e = p.oe.pop()

1279

1280

			p.afe.remove(e)

1281

			return

1282

1283

1284

		// Steps 12-13. Find the common ancestor and bookmark node.

1285

		commonAncestor := p.oe[feIndex-1]

1286

		bookmark := p.afe.index(formattingElement)

1287

1288

		// Step 14. The inner loop. Find the lastNode to reparent.

1289

		lastNode := furthestBlock

1290

		node := furthestBlock

1291

		x := p.oe.index(node)

1292

		// Step 14.1.

1293

		j := 0

1294

		for {

1295

			// Step 14.2.

1296

j++

1297

			// Step. 14.3.

1298

x--

1299

			node = p.oe[x]

1300

			// Step 14.4. Go to the next step if node is formatting element.

1301

			if node == formattingElement {

1302

				break

1303

1304

			// Step 14.5. Remove node from the list of active formatting elements if

1305

			// inner loop counter is greater than three and node is in the list of

1306

			// active formatting elements.

1307

			if ni := p.afe.index(node); j > 3 && ni > -1 {

1308

				p.afe.remove(node)

1309

				// If any element of the list of active formatting elements is removed,

1310

				// we need to take care whether bookmark should be decremented or not.

1311

				// This is because the value of bookmark may exceed the size of the

1312

				// list by removing elements from the list.

1313

				if ni <= bookmark {

1314

					bookmark--

1315

1316

				continue

1317

1318

			// Step 14.6. Continue the next inner loop if node is not in the list of

1319

			// active formatting elements.

1320

			if p.afe.index(node) == -1 {

1321

				p.oe.remove(node)

1322

				continue

1323

1324

			// Step 14.7.

1325

			clone := node.clone()

1326

			p.afe[p.afe.index(node)] = clone

1327

			p.oe[p.oe.index(node)] = clone

1328

			node = clone

1329

			// Step 14.8.

1330

			if lastNode == furthestBlock {

1331

				bookmark = p.afe.index(node) + 1

1332

1333

			// Step 14.9.

1334

			if lastNode.Parent != nil {

1335

				lastNode.Parent.RemoveChild(lastNode)

1336

1337

			node.AppendChild(lastNode)

1338

			// Step 14.10.

1339

			lastNode = node

1340

1341

1342

		// Step 15. Reparent lastNode to the common ancestor,

1343

		// or for misnested table nodes, to the foster parent.

1344

		if lastNode.Parent != nil {

1345

			lastNode.Parent.RemoveChild(lastNode)

1346

1347

		switch commonAncestor.DataAtom {

1348

		case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:

1349

			p.fosterParent(lastNode)

1350

		default:

1351

			commonAncestor.AppendChild(lastNode)

1352

1353

1354

		// Steps 16-18. Reparent nodes from the furthest block's children

1355

		// to a clone of the formatting element.

1356

		clone := formattingElement.clone()

1357

		reparentChildren(clone, furthestBlock)

1358

		furthestBlock.AppendChild(clone)

1359

1360

		// Step 19. Fix up the list of active formatting elements.

1361

		if oldLoc := p.afe.index(formattingElement); oldLoc != -1 && oldLoc < bookmark {

1362

			// Move the bookmark with the rest of the list.

1363

			bookmark--

1364

1365

		p.afe.remove(formattingElement)

1366

		p.afe.insert(bookmark, clone)

1367

1368

		// Step 20. Fix up the stack of open elements.

1369

		p.oe.remove(formattingElement)

1370

		p.oe.insert(p.oe.index(furthestBlock)+1, clone)

1371

1372

1373

1374

// inBodyEndTagOther performs the "any other end tag" algorithm for inBodyIM.

1375

// "Any other end tag" handling from 12.2.6.5 The rules for parsing tokens in foreign content

1376

// https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inforeign

1377

func (p *parser) inBodyEndTagOther(tagAtom a.Atom, tagName string) {

1378

	for i := len(p.oe) - 1; i >= 0; i-- {

1379

		// Two element nodes have the same tag if they have the same Data (a

1380

		// string-typed field). As an optimization, for common HTML tags, each

1381

		// Data string is assigned a unique, non-zero DataAtom (a uint32-typed

1382

		// field), since integer comparison is faster than string comparison.

1383

		// Uncommon (custom) tags get a zero DataAtom.

1384

//

1385

		// The if condition here is equivalent to (p.oe[i].Data == tagName).

1386

		if (p.oe[i].DataAtom == tagAtom) &&

1387

			((tagAtom != 0) || (p.oe[i].Data == tagName)) {

1388

			p.oe = p.oe[:i]

1389

			break

1390

1391

		if isSpecialElement(p.oe[i]) {

1392

			break

1393

1394

1395

1396

1397

// Section 12.2.6.4.8.

1398

func textIM(p *parser) bool {

1399

	switch p.tok.Type {

1400

	case ErrorToken:

1401

		p.oe.pop()

1402

	case TextToken:

1403

		d := p.tok.Data

1404

		if n := p.oe.top(); n.DataAtom == a.Textarea && n.FirstChild == nil {

1405

			// Ignore a newline at the start of a <textarea> block.

1406

			if d != "" && d[0] == '\r' {

1407

				d = d[1:]

1408

1409

			if d != "" && d[0] == '\n' {

1410

				d = d[1:]

1411

1412

1413

		if d == "" {

1414

			return true

1415

1416

		p.addText(d)

1417

		return true

1418

	case EndTagToken:

1419

		p.oe.pop()

1420

1421

	p.im = p.originalIM

1422

	p.originalIM = nil

1423

	return p.tok.Type == EndTagToken

1424

1425

1426

// Section 12.2.6.4.9.

1427

func inTableIM(p *parser) bool {

1428

	switch p.tok.Type {

1429

	case TextToken:

1430

		p.tok.Data = strings.Replace(p.tok.Data, "\x00", "", -1)

1431

		switch p.oe.top().DataAtom {

1432

		case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:

1433

			if strings.Trim(p.tok.Data, whitespace) == "" {

1434

				p.addText(p.tok.Data)

1435

				return true

1436

1437

1438

	case StartTagToken:

1439

		switch p.tok.DataAtom {

1440

		case a.Caption:

1441

			p.clearStackToContext(tableScope)

1442

			p.afe = append(p.afe, &scopeMarker)

1443

			p.addElement()

1444

			p.im = inCaptionIM

1445

			return true

1446

		case a.Colgroup:

1447

			p.clearStackToContext(tableScope)

1448

			p.addElement()

1449

			p.im = inColumnGroupIM

1450

			return true

1451

		case a.Col:

1452

			p.parseImpliedToken(StartTagToken, a.Colgroup, a.Colgroup.String())

1453

			return false

1454

		case a.Tbody, a.Tfoot, a.Thead:

1455

			p.clearStackToContext(tableScope)

1456

			p.addElement()

1457

			p.im = inTableBodyIM

1458

			return true

1459

		case a.Td, a.Th, a.Tr:

1460

			p.parseImpliedToken(StartTagToken, a.Tbody, a.Tbody.String())

1461

			return false

1462

		case a.Table:

1463

			if p.popUntil(tableScope, a.Table) {

1464

				p.resetInsertionMode()

1465

				return false

1466

1467

			// Ignore the token.

1468

			return true

1469

		case a.Style, a.Script, a.Template:

1470

			return inHeadIM(p)

1471

		case a.Input:

1472

			for _, t := range p.tok.Attr {

1473

				if t.Key == "type" && strings.EqualFold(t.Val, "hidden") {

1474

					p.addElement()

1475

					p.oe.pop()

1476

					return true

1477

1478

1479

			// Otherwise drop down to the default action.

1480

		case a.Form:

1481

			if p.oe.contains(a.Template) || p.form != nil {

1482

				// Ignore the token.

1483

				return true

1484

1485

			p.addElement()

1486

			p.form = p.oe.pop()

1487

		case a.Select:

1488

			p.reconstructActiveFormattingElements()

1489

			switch p.top().DataAtom {

1490

			case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:

1491

				p.fosterParenting = true

1492

1493

			p.addElement()

1494

			p.fosterParenting = false

1495

			p.framesetOK = false

1496

			p.im = inSelectInTableIM

1497

			return true

1498

1499

	case EndTagToken:

1500

		switch p.tok.DataAtom {

1501

		case a.Table:

1502

			if p.popUntil(tableScope, a.Table) {

1503

				p.resetInsertionMode()

1504

				return true

1505

1506

			// Ignore the token.

1507

			return true

1508

		case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr:

1509

			// Ignore the token.

1510

			return true

1511

		case a.Template:

1512

			return inHeadIM(p)

1513

1514

	case CommentToken:

1515

		p.addChild(&Node{

1516

			Type: CommentNode,

1517

			Data: p.tok.Data,

1518

})

1519

		return true

1520

	case DoctypeToken:

1521

		// Ignore the token.

1522

		return true

1523

	case ErrorToken:

1524

		return inBodyIM(p)

1525

1526

1527

	p.fosterParenting = true

1528

	defer func() { p.fosterParenting = false }()

1529

1530

	return inBodyIM(p)

1531

1532

1533

// Section 12.2.6.4.11.

1534

func inCaptionIM(p *parser) bool {

1535

	switch p.tok.Type {

1536

	case StartTagToken:

1537

		switch p.tok.DataAtom {

1538

		case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Td, a.Tfoot, a.Thead, a.Tr:

1539

			if !p.popUntil(tableScope, a.Caption) {

1540

				// Ignore the token.

1541

				return true

1542

1543

			p.clearActiveFormattingElements()

1544

			p.im = inTableIM

1545

			return false

1546

		case a.Select:

1547

			p.reconstructActiveFormattingElements()

1548

			p.addElement()

1549

			p.framesetOK = false

1550

			p.im = inSelectInTableIM

1551

			return true

1552

1553

	case EndTagToken:

1554

		switch p.tok.DataAtom {

1555

		case a.Caption:

1556

			if p.popUntil(tableScope, a.Caption) {

1557

				p.clearActiveFormattingElements()

1558

				p.im = inTableIM

1559

1560

			return true

1561

		case a.Table:

1562

			if !p.popUntil(tableScope, a.Caption) {

1563

				// Ignore the token.

1564

				return true

1565

1566

			p.clearActiveFormattingElements()

1567

			p.im = inTableIM

1568

			return false

1569

		case a.Body, a.Col, a.Colgroup, a.Html, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr:

1570

			// Ignore the token.

1571

			return true

1572

1573

1574

	return inBodyIM(p)

1575

1576

1577

// Section 12.2.6.4.12.

1578

func inColumnGroupIM(p *parser) bool {

1579

	switch p.tok.Type {

1580

	case TextToken:

1581

		s := strings.TrimLeft(p.tok.Data, whitespace)

1582

		if len(s) < len(p.tok.Data) {

1583

			// Add the initial whitespace to the current node.

1584

			p.addText(p.tok.Data[:len(p.tok.Data)-len(s)])

1585

			if s == "" {

1586

				return true

1587

1588

			p.tok.Data = s

1589

1590

	case CommentToken:

1591

		p.addChild(&Node{

1592

			Type: CommentNode,

1593

			Data: p.tok.Data,

1594

})

1595

		return true

1596

	case DoctypeToken:

1597

		// Ignore the token.

1598

		return true

1599

	case StartTagToken:

1600

		switch p.tok.DataAtom {

1601

		case a.Html:

1602

			return inBodyIM(p)

1603

		case a.Col:

1604

			p.addElement()

1605

			p.oe.pop()

1606

			p.acknowledgeSelfClosingTag()

1607

			return true

1608

		case a.Template:

1609

			return inHeadIM(p)

1610

1611

	case EndTagToken:

1612

		switch p.tok.DataAtom {

1613

		case a.Colgroup:

1614

			if p.oe.top().DataAtom == a.Colgroup {

1615

				p.oe.pop()

1616

				p.im = inTableIM

1617

1618

			return true

1619

		case a.Col:

1620

			// Ignore the token.

1621

			return true

1622

		case a.Template:

1623

			return inHeadIM(p)

1624

1625

	case ErrorToken:

1626

		return inBodyIM(p)

1627

1628

	if p.oe.top().DataAtom != a.Colgroup {

1629

		return true

1630

1631

	p.oe.pop()

1632

	p.im = inTableIM

1633

	return false

1634

1635

1636

// Section 12.2.6.4.13.

1637

func inTableBodyIM(p *parser) bool {

1638

	switch p.tok.Type {

1639

	case StartTagToken:

1640

		switch p.tok.DataAtom {

1641

		case a.Tr:

1642

			p.clearStackToContext(tableBodyScope)

1643

			p.addElement()

1644

			p.im = inRowIM

1645

			return true

1646

		case a.Td, a.Th:

1647

			p.parseImpliedToken(StartTagToken, a.Tr, a.Tr.String())

1648

			return false

1649

		case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Tfoot, a.Thead:

1650

			if p.popUntil(tableScope, a.Tbody, a.Thead, a.Tfoot) {

1651

				p.im = inTableIM

1652

				return false

1653

1654

			// Ignore the token.

1655

			return true

1656

1657

	case EndTagToken:

1658

		switch p.tok.DataAtom {

1659

		case a.Tbody, a.Tfoot, a.Thead:

1660

			if p.elementInScope(tableScope, p.tok.DataAtom) {

1661

				p.clearStackToContext(tableBodyScope)

1662

				p.oe.pop()

1663

				p.im = inTableIM

1664

1665

			return true

1666

		case a.Table:

1667

			if p.popUntil(tableScope, a.Tbody, a.Thead, a.Tfoot) {

1668

				p.im = inTableIM

1669

				return false

1670

1671

			// Ignore the token.

1672

			return true

1673

		case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Td, a.Th, a.Tr:

1674

			// Ignore the token.

1675

			return true

1676

1677

	case CommentToken:

1678

		p.addChild(&Node{

1679

			Type: CommentNode,

1680

			Data: p.tok.Data,

1681

})

1682

		return true

1683

1684

1685

	return inTableIM(p)

1686

1687

1688

// Section 13.2.6.4.14.

1689

func inRowIM(p *parser) bool {

1690

	switch p.tok.Type {

1691

	case StartTagToken:

1692

		switch p.tok.DataAtom {

1693

		case a.Td, a.Th:

1694

			p.clearStackToContext(tableRowScope)

1695

			p.addElement()

1696

			p.afe = append(p.afe, &scopeMarker)

1697

			p.im = inCellIM

1698

			return true

1699

		case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Tfoot, a.Thead, a.Tr:

1700

			if p.elementInScope(tableScope, a.Tr) {

1701

				p.clearStackToContext(tableRowScope)

1702

				p.oe.pop()

1703

				p.im = inTableBodyIM

1704

				return false

1705

1706

			// Ignore the token.

1707

			return true

1708

1709

	case EndTagToken:

1710

		switch p.tok.DataAtom {

1711

		case a.Tr:

1712

			if p.elementInScope(tableScope, a.Tr) {

1713

				p.clearStackToContext(tableRowScope)

1714

				p.oe.pop()

1715

				p.im = inTableBodyIM

1716

				return true

1717

1718

			// Ignore the token.

1719

			return true

1720

		case a.Table:

1721

			if p.elementInScope(tableScope, a.Tr) {

1722

				p.clearStackToContext(tableRowScope)

1723

				p.oe.pop()

1724

				p.im = inTableBodyIM

1725

				return false

1726

1727

			// Ignore the token.

1728

			return true

1729

		case a.Tbody, a.Tfoot, a.Thead:

1730

			if p.elementInScope(tableScope, p.tok.DataAtom) && p.elementInScope(tableScope, a.Tr) {

1731

				p.clearStackToContext(tableRowScope)

1732

				p.oe.pop()

1733

				p.im = inTableBodyIM

1734

				return false

1735

1736

			// Ignore the token.

1737

			return true

1738

		case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Td, a.Th:

1739

			// Ignore the token.

1740

			return true

1741

1742

1743

1744

	return inTableIM(p)

1745

1746

1747

// Section 12.2.6.4.15.

1748

func inCellIM(p *parser) bool {

1749

	switch p.tok.Type {

1750

	case StartTagToken:

1751

		switch p.tok.DataAtom {

1752

		case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr:

1753

			if p.popUntil(tableScope, a.Td, a.Th) {

1754

				// Close the cell and reprocess.

1755

				p.clearActiveFormattingElements()

1756

				p.im = inRowIM

1757

				return false

1758

1759

			// Ignore the token.

1760

			return true

1761

		case a.Select:

1762

			p.reconstructActiveFormattingElements()

1763

			p.addElement()

1764

			p.framesetOK = false

1765

			p.im = inSelectInTableIM

1766

			return true

1767

1768

	case EndTagToken:

1769

		switch p.tok.DataAtom {

1770

		case a.Td, a.Th:

1771

			if !p.popUntil(tableScope, p.tok.DataAtom) {

1772

				// Ignore the token.

1773

				return true

1774

1775

			p.clearActiveFormattingElements()

1776

			p.im = inRowIM

1777

			return true

1778

		case a.Body, a.Caption, a.Col, a.Colgroup, a.Html:

1779

			// Ignore the token.

1780

			return true

1781

		case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:

1782

			if !p.elementInScope(tableScope, p.tok.DataAtom) {

1783

				// Ignore the token.

1784

				return true

1785

1786

			// Close the cell and reprocess.

1787

			if p.popUntil(tableScope, a.Td, a.Th) {

1788

				p.clearActiveFormattingElements()

1789

1790

			p.im = inRowIM

1791

			return false

1792

1793

1794

	return inBodyIM(p)

1795

1796

1797

// Section 12.2.6.4.16.

1798

func inSelectIM(p *parser) bool {

1799

	switch p.tok.Type {

1800

	case TextToken:

1801

		p.addText(strings.Replace(p.tok.Data, "\x00", "", -1))

1802

	case StartTagToken:

1803

		switch p.tok.DataAtom {

1804

		case a.Html:

1805

			return inBodyIM(p)

1806

		case a.Option:

1807

			if p.top().DataAtom == a.Option {

1808

				p.oe.pop()

1809

1810

			p.addElement()

1811

		case a.Optgroup:

1812

			if p.top().DataAtom == a.Option {

1813

				p.oe.pop()

1814

1815

			if p.top().DataAtom == a.Optgroup {

1816

				p.oe.pop()

1817

1818

			p.addElement()

1819

		case a.Select:

1820

			if !p.popUntil(selectScope, a.Select) {

1821

				// Ignore the token.

1822

				return true

1823

1824

			p.resetInsertionMode()

1825

		case a.Input, a.Keygen, a.Textarea:

1826

			if p.elementInScope(selectScope, a.Select) {

1827

				p.parseImpliedToken(EndTagToken, a.Select, a.Select.String())

1828

				return false

1829

1830

			// In order to properly ignore <textarea>, we need to change the tokenizer mode.

1831

			p.tokenizer.NextIsNotRawText()

1832

			// Ignore the token.

1833

			return true

1834

		case a.Script, a.Template:

1835

			return inHeadIM(p)

1836

		case a.Iframe, a.Noembed, a.Noframes, a.Noscript, a.Plaintext, a.Style, a.Title, a.Xmp:

1837

			// Don't let the tokenizer go into raw text mode when there are raw tags

1838

			// to be ignored. These tags should be ignored from the tokenizer

1839

			// properly.

1840

			p.tokenizer.NextIsNotRawText()

1841

			// Ignore the token.

1842

			return true

1843

1844

	case EndTagToken:

1845

		switch p.tok.DataAtom {

1846

		case a.Option:

1847

			if p.top().DataAtom == a.Option {

1848

				p.oe.pop()

1849

1850

		case a.Optgroup:

1851

			i := len(p.oe) - 1

1852

			if p.oe[i].DataAtom == a.Option {

1853

i--

1854

1855

			if p.oe[i].DataAtom == a.Optgroup {

1856

				p.oe = p.oe[:i]

1857

1858

		case a.Select:

1859

			if !p.popUntil(selectScope, a.Select) {

1860

				// Ignore the token.

1861

				return true

1862

1863

			p.resetInsertionMode()

1864

		case a.Template:

1865

			return inHeadIM(p)

1866

1867

	case CommentToken:

1868

		p.addChild(&Node{

1869

			Type: CommentNode,

1870

			Data: p.tok.Data,

1871

})

1872

	case DoctypeToken:

1873

		// Ignore the token.

1874

		return true

1875

	case ErrorToken:

1876

		return inBodyIM(p)

1877

1878

1879

	return true

1880

1881

1882

// Section 12.2.6.4.17.

1883

func inSelectInTableIM(p *parser) bool {

1884

	switch p.tok.Type {

1885

	case StartTagToken, EndTagToken:

1886

		switch p.tok.DataAtom {

1887

		case a.Caption, a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr, a.Td, a.Th:

1888

			if p.tok.Type == EndTagToken && !p.elementInScope(tableScope, p.tok.DataAtom) {

1889

				// Ignore the token.

1890

				return true

1891

1892

			// This is like p.popUntil(selectScope, a.Select), but it also

1893

			// matches <math select>, not just <select>. Matching the MathML

1894

			// tag is arguably incorrect (conceptually), but it mimics what

1895

			// Chromium does.

1896

			for i := len(p.oe) - 1; i >= 0; i-- {

1897

				if n := p.oe[i]; n.DataAtom == a.Select {

1898

					p.oe = p.oe[:i]

1899

					break

1900

1901

1902

			p.resetInsertionMode()

1903

			return false

1904

1905

1906

	return inSelectIM(p)

1907

1908

1909

// Section 12.2.6.4.18.

1910

func inTemplateIM(p *parser) bool {

1911

	switch p.tok.Type {

1912

	case TextToken, CommentToken, DoctypeToken:

1913

		return inBodyIM(p)

1914

	case StartTagToken:

1915

		switch p.tok.DataAtom {

1916

		case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title:

1917

			return inHeadIM(p)

1918

		case a.Caption, a.Colgroup, a.Tbody, a.Tfoot, a.Thead:

1919

			p.templateStack.pop()

1920

			p.templateStack = append(p.templateStack, inTableIM)

1921

			p.im = inTableIM

1922

			return false

1923

		case a.Col:

1924

			p.templateStack.pop()

1925

			p.templateStack = append(p.templateStack, inColumnGroupIM)

1926

			p.im = inColumnGroupIM

1927

			return false

1928

		case a.Tr:

1929

			p.templateStack.pop()

1930

			p.templateStack = append(p.templateStack, inTableBodyIM)

1931

			p.im = inTableBodyIM

1932

			return false

1933

		case a.Td, a.Th:

1934

			p.templateStack.pop()

1935

			p.templateStack = append(p.templateStack, inRowIM)

1936

			p.im = inRowIM

1937

			return false

1938

		default:

1939

			p.templateStack.pop()

1940

			p.templateStack = append(p.templateStack, inBodyIM)

1941

			p.im = inBodyIM

1942

			return false

1943

1944

	case EndTagToken:

1945

		switch p.tok.DataAtom {

1946

		case a.Template:

1947

			return inHeadIM(p)

1948

		default:

1949

			// Ignore the token.

1950

			return true

1951

1952

	case ErrorToken:

1953

		if !p.oe.contains(a.Template) {

1954

			// Ignore the token.

1955

			return true

1956

1957

		// TODO: remove this divergence from the HTML5 spec.

1958

//

1959

		// See https://bugs.chromium.org/p/chromium/issues/detail?id=829668

1960

		p.generateImpliedEndTags()

1961

		for i := len(p.oe) - 1; i >= 0; i-- {

1962

			if n := p.oe[i]; n.Namespace == "" && n.DataAtom == a.Template {

1963

				p.oe = p.oe[:i]

1964

				break

1965

1966

1967

		p.clearActiveFormattingElements()

1968

		p.templateStack.pop()

1969

		p.resetInsertionMode()

1970

		return false

1971

1972

	return false

1973

1974

1975

// Section 12.2.6.4.19.

1976

func afterBodyIM(p *parser) bool {

1977

	switch p.tok.Type {

1978

	case ErrorToken:

1979

		// Stop parsing.

1980

		return true

1981

	case TextToken:

1982

		s := strings.TrimLeft(p.tok.Data, whitespace)

1983

		if len(s) == 0 {

1984

			// It was all whitespace.

1985

			return inBodyIM(p)

1986

1987

	case StartTagToken:

1988

		if p.tok.DataAtom == a.Html {

1989

			return inBodyIM(p)

1990

1991

	case EndTagToken:

1992

		if p.tok.DataAtom == a.Html {

1993

			if !p.fragment {

1994

				p.im = afterAfterBodyIM

1995

1996

			return true

1997

1998

	case CommentToken:

1999

		// The comment is attached to the <html> element.

2000

		if len(p.oe) < 1 || p.oe[0].DataAtom != a.Html {

2001

			panic("html: bad parser state: <html> element not found, in the after-body insertion mode")

2002

2003

		p.oe[0].AppendChild(&Node{

2004

			Type: CommentNode,

2005

			Data: p.tok.Data,

2006

})

2007

		return true

2008

2009

	p.im = inBodyIM

2010

	return false

2011

2012

2013

// Section 12.2.6.4.20.

2014

func inFramesetIM(p *parser) bool {

2015

	switch p.tok.Type {

2016

	case CommentToken:

2017

		p.addChild(&Node{

2018

			Type: CommentNode,

2019

			Data: p.tok.Data,

2020

})

2021

	case TextToken:

2022

		// Ignore all text but whitespace.

2023

		s := strings.Map(func(c rune) rune {

2024

			switch c {

2025

			case ' ', '\t', '\n', '\f', '\r':

2026

				return c

2027

2028

			return -1

2029

		}, p.tok.Data)

2030

		if s != "" {

2031

			p.addText(s)

2032

2033

	case StartTagToken:

2034

		switch p.tok.DataAtom {

2035

		case a.Html:

2036

			return inBodyIM(p)

2037

		case a.Frameset:

2038

			p.addElement()

2039

		case a.Frame:

2040

			p.addElement()

2041

			p.oe.pop()

2042

			p.acknowledgeSelfClosingTag()

2043

		case a.Noframes:

2044

			return inHeadIM(p)

2045

2046

	case EndTagToken:

2047

		switch p.tok.DataAtom {

2048

		case a.Frameset:

2049

			if p.oe.top().DataAtom != a.Html {

2050

				p.oe.pop()

2051

				if p.oe.top().DataAtom != a.Frameset {

2052

					p.im = afterFramesetIM

2053

					return true

2054

2055

2056

2057

	default:

2058

		// Ignore the token.

2059

2060

	return true

2061

2062

2063

// Section 12.2.6.4.21.

2064

func afterFramesetIM(p *parser) bool {

2065

	switch p.tok.Type {

2066

	case CommentToken:

2067

		p.addChild(&Node{

2068

			Type: CommentNode,

2069

			Data: p.tok.Data,

2070

})

2071

	case TextToken:

2072

		// Ignore all text but whitespace.

2073

		s := strings.Map(func(c rune) rune {

2074

			switch c {

2075

			case ' ', '\t', '\n', '\f', '\r':

2076

				return c

2077

2078

			return -1

2079

		}, p.tok.Data)

2080

		if s != "" {

2081

			p.addText(s)

2082

2083

	case StartTagToken:

2084

		switch p.tok.DataAtom {

2085

		case a.Html:

2086

			return inBodyIM(p)

2087

		case a.Noframes:

2088

			return inHeadIM(p)

2089

2090

	case EndTagToken:

2091

		switch p.tok.DataAtom {

2092

		case a.Html:

2093

			p.im = afterAfterFramesetIM

2094

			return true

2095

2096

	default:

2097

		// Ignore the token.

2098

2099

	return true

2100

2101

2102

// Section 12.2.6.4.22.

2103

func afterAfterBodyIM(p *parser) bool {

2104

	switch p.tok.Type {

2105

	case ErrorToken:

2106

		// Stop parsing.

2107

		return true

2108

	case TextToken:

2109

		s := strings.TrimLeft(p.tok.Data, whitespace)

2110

		if len(s) == 0 {

2111

			// It was all whitespace.

2112

			return inBodyIM(p)

2113

2114

	case StartTagToken:

2115

		if p.tok.DataAtom == a.Html {

2116

			return inBodyIM(p)

2117

2118

	case CommentToken:

2119

		p.doc.AppendChild(&Node{

2120

			Type: CommentNode,

2121

			Data: p.tok.Data,

2122

})

2123

		return true

2124

	case DoctypeToken:

2125

		return inBodyIM(p)

2126

2127

	p.im = inBodyIM

2128

	return false

2129

2130

2131

// Section 12.2.6.4.23.

2132

func afterAfterFramesetIM(p *parser) bool {

2133

	switch p.tok.Type {

2134

	case CommentToken:

2135

		p.doc.AppendChild(&Node{

2136

			Type: CommentNode,

2137

			Data: p.tok.Data,

2138

})

2139

	case TextToken:

2140

		// Ignore all text but whitespace.

2141

		s := strings.Map(func(c rune) rune {

2142

			switch c {

2143

			case ' ', '\t', '\n', '\f', '\r':

2144

				return c

2145

2146

			return -1

2147

		}, p.tok.Data)

2148

		if s != "" {

2149

			p.tok.Data = s

2150

			return inBodyIM(p)

2151

2152

	case StartTagToken:

2153

		switch p.tok.DataAtom {

2154

		case a.Html:

2155

			return inBodyIM(p)

2156

		case a.Noframes:

2157

			return inHeadIM(p)

2158

2159

	case DoctypeToken:

2160

		return inBodyIM(p)

2161

	default:

2162

		// Ignore the token.

2163

2164

	return true

2165

2166

2167

func ignoreTheRemainingTokens(p *parser) bool {

2168

	return true

2169

2170

2171

// whitespaceOrNUL is the package's whitespace constant with the NUL byte
// appended. parseForeignContent trims it from text tokens to decide whether
// the text leaves framesetOK set.
const whitespaceOrNUL = whitespace + "\x00"

2172

2173

// Section 12.2.6.5

2174

func parseForeignContent(p *parser) bool {

2175

	switch p.tok.Type {

2176

	case TextToken:

2177

		if p.framesetOK {

2178

			p.framesetOK = strings.TrimLeft(p.tok.Data, whitespaceOrNUL) == ""

2179

2180

		p.tok.Data = strings.Replace(p.tok.Data, "\x00", "\ufffd", -1)

2181

		p.addText(p.tok.Data)

2182

	case CommentToken:

2183

		p.addChild(&Node{

2184

			Type: CommentNode,

2185

			Data: p.tok.Data,

2186

})

2187

	case StartTagToken:

2188

		if !p.fragment {

2189

			b := breakout[p.tok.Data]

2190

			if p.tok.DataAtom == a.Font {

2191

			loop:

2192

				for _, attr := range p.tok.Attr {

2193

					switch attr.Key {

2194

					case "color", "face", "size":

2195

						b = true

2196

						break loop

2197

2198

2199

2200

			if b {

2201

				for i := len(p.oe) - 1; i >= 0; i-- {

2202

					n := p.oe[i]

2203

					if n.Namespace == "" || htmlIntegrationPoint(n) || mathMLTextIntegrationPoint(n) {

2204

						p.oe = p.oe[:i+1]

2205

						break

2206

2207

2208

				return false

2209

2210

2211

		current := p.adjustedCurrentNode()

2212

		switch current.Namespace {

2213

		case "math":

2214

			adjustAttributeNames(p.tok.Attr, mathMLAttributeAdjustments)

2215

		case "svg":

2216

			// Adjust SVG tag names. The tokenizer lower-cases tag names, but

2217

			// SVG wants e.g. "foreignObject" with a capital second "O".

2218

			if x := svgTagNameAdjustments[p.tok.Data]; x != "" {

2219

				p.tok.DataAtom = a.Lookup([]byte(x))

2220

				p.tok.Data = x

2221

2222

			adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments)

2223

		default:

2224

			panic("html: bad parser state: unexpected namespace")

2225

2226

		adjustForeignAttributes(p.tok.Attr)

2227

		namespace := current.Namespace

2228

		p.addElement()

2229

		p.top().Namespace = namespace

2230

		if namespace != "" {

2231

			// Don't let the tokenizer go into raw text mode in foreign content

2232

			// (e.g. in an SVG <title> tag).

2233

			p.tokenizer.NextIsNotRawText()

2234

2235

		if p.hasSelfClosingToken {

2236

			p.oe.pop()

2237

			p.acknowledgeSelfClosingTag()

2238

2239

	case EndTagToken:

2240

		if strings.EqualFold(p.oe[len(p.oe)-1].Data, p.tok.Data) {

2241

			p.oe = p.oe[:len(p.oe)-1]

2242

			return true

2243

2244

		for i := len(p.oe) - 1; i >= 0; i-- {

2245

			if strings.EqualFold(p.oe[i].Data, p.tok.Data) {

2246

				p.oe = p.oe[:i]

2247

				return true

2248

2249

			if i > 0 && p.oe[i-1].Namespace == "" {

2250

				break

2251

2252

2253

		return p.im(p)

2254

	default:

2255

		// Ignore the token.

2256

2257

	return true

2258

2259

2260

// Section 12.2.4.2.

2261

func (p *parser) adjustedCurrentNode() *Node {

2262

	if len(p.oe) == 1 && p.fragment && p.context != nil {

2263

		return p.context

2264

2265

	return p.oe.top()

2266

2267

2268

// Section 12.2.6.

2269

func (p *parser) inForeignContent() bool {

2270

	if len(p.oe) == 0 {

2271

		return false

2272

2273

	n := p.adjustedCurrentNode()

2274

	if n.Namespace == "" {

2275

		return false

2276

2277

	if mathMLTextIntegrationPoint(n) {

2278

		if p.tok.Type == StartTagToken && p.tok.DataAtom != a.Mglyph && p.tok.DataAtom != a.Malignmark {

2279

			return false

2280

2281

		if p.tok.Type == TextToken {

2282

			return false

2283

2284

2285

	if n.Namespace == "math" && n.DataAtom == a.AnnotationXml && p.tok.Type == StartTagToken && p.tok.DataAtom == a.Svg {

2286

		return false

2287

2288

	if htmlIntegrationPoint(n) && (p.tok.Type == StartTagToken || p.tok.Type == TextToken) {

2289

		return false

2290

2291

	if p.tok.Type == ErrorToken {

2292

		return false

2293

2294

	return true

2295

2296

2297

// parseImpliedToken parses a token as though it had appeared in the parser's

2298

// input.

2299

func (p *parser) parseImpliedToken(t TokenType, dataAtom a.Atom, data string) {

2300

	realToken, selfClosing := p.tok, p.hasSelfClosingToken

2301

	p.tok = Token{

2302

		Type:     t,

2303

		DataAtom: dataAtom,

2304

		Data:     data,

2305

2306

	p.hasSelfClosingToken = false

2307

	p.parseCurrentToken()

2308

	p.tok, p.hasSelfClosingToken = realToken, selfClosing

2309

2310

2311

// parseCurrentToken runs the current token through the parsing routines

2312

// until it is consumed.

2313

func (p *parser) parseCurrentToken() {

2314

	if p.tok.Type == SelfClosingTagToken {

2315

		p.hasSelfClosingToken = true

2316

		p.tok.Type = StartTagToken

2317

2318

2319

	consumed := false

2320

	for !consumed {

2321

		if p.inForeignContent() {

2322

			consumed = parseForeignContent(p)

2323

		} else {

2324

			consumed = p.im(p)

2325

2326

2327

2328

	if p.hasSelfClosingToken {

2329

		// This is a parse error, but ignore it.

2330

		p.hasSelfClosingToken = false

2331

2332

2333

2334

func (p *parser) parse() (err error) {

2335

	defer func() {

2336

		if panicErr := recover(); panicErr != nil {

2337

			err = fmt.Errorf("%s", panicErr)

2338

2339

}()

2340

	// Iterate until EOF. Any other error will cause an early return.

2341

	for err != io.EOF {

2342

		// CDATA sections are allowed only in foreign content.

2343

		n := p.oe.top()

2344

		p.tokenizer.AllowCDATA(n != nil && n.Namespace != "")

2345

		// Read and parse the next token.

2346

		p.tokenizer.Next()

2347

		p.tok = p.tokenizer.Token()

2348

		if p.tok.Type == ErrorToken {

2349

			err = p.tokenizer.Err()

2350

			if err != nil && err != io.EOF {

2351

				return err

2352

2353

2354

		p.parseCurrentToken()

2355

2356

	return nil

2357

2358

2359

// Parse returns the parse tree for the HTML from the given Reader.

2360

//

2361

// It implements the HTML5 parsing algorithm

2362

// (https://html.spec.whatwg.org/multipage/syntax.html#tree-construction),

2363

// which is very complicated. The resultant tree can contain implicitly created

2364

// nodes that have no explicit <tag> listed in r's data, and nodes' parents can

2365

// differ from the nesting implied by a naive processing of start and end

2366

// <tag>s. Conversely, explicit <tag>s in r's data can be silently dropped,

2367

// with no corresponding node in the resulting tree.

2368

//

2369

// Parse will reject HTML that is nested deeper than 512 elements.

2370

//

2371

// The input is assumed to be UTF-8 encoded.

2372

func Parse(r io.Reader) (*Node, error) {

2373

	return ParseWithOptions(r)

2374

2375

2376

// ParseFragment parses a fragment of HTML and returns the nodes that were

2377

// found. If the fragment is the InnerHTML for an existing element, pass that

2378

// element in context.

2379

//

2380

// It has the same intricacies as Parse.

2381

func ParseFragment(r io.Reader, context *Node) ([]*Node, error) {

2382

	return ParseFragmentWithOptions(r, context)

2383

2384

2385

// ParseOption configures a parser.

2386

type ParseOption func(p *parser)

2387

2388

// ParseOptionEnableScripting configures the scripting flag.

2389

// https://html.spec.whatwg.org/multipage/webappapis.html#enabling-and-disabling-scripting

2390

//

2391

// By default, scripting is enabled.

2392

func ParseOptionEnableScripting(enable bool) ParseOption {

2393

	return func(p *parser) {

2394

		p.scripting = enable

2395

2396

2397

2398

// ParseWithOptions is like Parse, with options.

2399

func ParseWithOptions(r io.Reader, opts ...ParseOption) (*Node, error) {

2400

	p := &parser{

2401

		tokenizer: NewTokenizer(r),

2402

		doc: &Node{

2403

			Type: DocumentNode,

2404

},

2405

		scripting:  true,

2406

		framesetOK: true,

2407

		im:         initialIM,

2408

2409

2410

	for _, f := range opts {

2411

		f(p)

2412

2413

2414

	if err := p.parse(); err != nil {

2415

		return nil, err

2416

2417

	return p.doc, nil

2418

2419

2420

// ParseFragmentWithOptions is like ParseFragment, with options.

2421

func ParseFragmentWithOptions(r io.Reader, context *Node, opts ...ParseOption) ([]*Node, error) {

2422

	contextTag := ""

2423

	if context != nil {

2424

		if context.Type != ElementNode {

2425

			return nil, errors.New("html: ParseFragment of non-element Node")

2426

2427

		// The next check isn't just context.DataAtom.String() == context.Data because

2428

		// it is valid to pass an element whose tag isn't a known atom. For example,

2429

		// DataAtom == 0 and Data = "tagfromthefuture" is perfectly consistent.

2430

		if context.DataAtom != a.Lookup([]byte(context.Data)) {

2431

			return nil, fmt.Errorf("html: inconsistent Node: DataAtom=%q, Data=%q", context.DataAtom, context.Data)

2432

2433

		contextTag = context.DataAtom.String()

2434

2435

	p := &parser{

2436

		doc: &Node{

2437

			Type: DocumentNode,

2438

},

2439

		scripting: true,

2440

		fragment:  true,

2441

		context:   context,

2442

2443

	if context != nil && context.Namespace != "" {

2444

		p.tokenizer = NewTokenizer(r)

2445

	} else {

2446

		p.tokenizer = NewTokenizerFragment(r, contextTag)

2447

2448

2449

	for _, f := range opts {

2450

		f(p)

2451

2452

2453

	root := &Node{

2454

		Type:     ElementNode,

2455

		DataAtom: a.Html,

2456

		Data:     a.Html.String(),

2457

2458

	p.doc.AppendChild(root)

2459

	p.oe = nodeStack{root}

2460

	if context != nil && context.DataAtom == a.Template {

2461

		p.templateStack = append(p.templateStack, inTemplateIM)

2462

2463

	p.resetInsertionMode()

2464

2465

	for n := context; n != nil; n = n.Parent {

2466

		if n.Type == ElementNode && n.DataAtom == a.Form {

2467

			p.form = n

2468

			break

2469

2470

2471

2472

	if err := p.parse(); err != nil {

2473

		return nil, err

2474

2475

2476

	parent := p.doc

2477

	if context != nil {

2478

		parent = root

2479

2480

2481

	var result []*Node

2482

	for c := parent.FirstChild; c != nil; {

2483

		next := c.NextSibling

2484

		parent.RemoveChild(c)

2485

		result = append(result, c)

2486

		c = next

2487

2488

	return result, nil

2489