all repos

rss-tools @ 71f9578bfe2969b6b22984c4264c8bf6c067e608

Get RSS feeds from sources that (I need and) don't provide one

rss-tools/sources/telegram/page_title.go (view raw)

Oleksandr Smirnov Oleksandr Smirnov
olexsmir@gmail.com
telegram: fetch links title, 1 month ago
1
package telegram
2
3
import (
4
	"context"
5
	"fmt"
6
	"io"
7
	"net/http"
8
	"strings"
9
10
	"github.com/PuerkitoBio/goquery"
11
	"golang.org/x/net/html/charset"
12
)
13
14
// maxPageBytes is the upper bound on how many bytes of a response body
// fetchPageTitle reads before handing the data to the HTML parser.
const maxPageBytes = 2 << 20 // 2 MiB
15
16
func fetchPageTitle(ctx context.Context, get func(context.Context, string) (*http.Response, error), rawURL string) (string, error) {
17
	if get == nil {
18
		return "", fmt.Errorf("missing page getter")
19
	}
20
21
	resp, err := get(ctx, rawURL)
22
	if err != nil {
23
		return "", err
24
	}
25
	defer resp.Body.Close()
26
27
	if resp.StatusCode < http.StatusOK || resp.StatusCode >= http.StatusMultipleChoices {
28
		return "", fmt.Errorf("unexpected status code: %d", resp.StatusCode)
29
	}
30
31
	decoded, err := charset.NewReader(io.LimitReader(resp.Body, maxPageBytes), resp.Header.Get("Content-Type"))
32
	if err != nil {
33
		return "", err
34
	}
35
36
	doc, err := goquery.NewDocumentFromReader(decoded)
37
	if err != nil {
38
		return "", err
39
	}
40
41
	title := normalizePageTitle(doc.Find("title").First().Text())
42
	if !isMeaningfulPageTitle(title) {
43
		title = metaPageTitle(doc)
44
	}
45
	if !isMeaningfulPageTitle(title) {
46
		return "", fmt.Errorf("page title is empty")
47
	}
48
	return title, nil
49
}
50
51
func metaPageTitle(doc *goquery.Document) string {
52
	selectors := []string{
53
		`meta[property="og:title"]`,
54
		`meta[name="og:title"]`,
55
		`meta[property="twitter:title"]`,
56
		`meta[name="twitter:title"]`,
57
		`meta[itemprop="name"]`,
58
	}
59
60
	for _, selector := range selectors {
61
		content, ok := doc.Find(selector).First().Attr("content")
62
		if !ok {
63
			continue
64
		}
65
		title := normalizePageTitle(content)
66
		if isMeaningfulPageTitle(title) {
67
			return title
68
		}
69
	}
70
	return ""
71
}
72
73
// normalizePageTitle collapses every run of whitespace in raw into a single
// space and removes leading and trailing whitespace.
func normalizePageTitle(raw string) string {
	words := strings.Fields(raw)
	return strings.Join(words, " ")
}
76
77
// isMeaningfulPageTitle reports whether title carries real information.
//
// After trimming surrounding whitespace and lowercasing, a title that is
// empty, "youtube" or "- youtube" is rejected; anything else is accepted.
func isMeaningfulPageTitle(title string) bool {
	cleaned := strings.ToLower(strings.TrimSpace(title))
	if cleaned == "" || cleaned == "youtube" || cleaned == "- youtube" {
		return false
	}
	return true
}