all repos

rss-tools @ 70f8cb68d68020ffbcd4b0b5d602cefaa36e5a2a

Get RSS feeds from sources that (I need and) don't provide one

rss-tools/sources/telegram/page_title.go (view raw)

Oleksandr Smirnov Oleksandr Smirnov
olexsmir@gmail.com
youtube: fix title fetcher, 10 days ago
1
package telegram
2
3
import (
4
	"context"
5
	"encoding/json"
6
	"fmt"
7
	"io"
8
	"net/http"
9
	"net/url"
10
	"strings"
11
12
	"github.com/PuerkitoBio/goquery"
13
	"golang.org/x/net/html/charset"
14
)
15
16
// maxPageBytes is the upper bound, in bytes, on how much of a response body
// is read when looking for a page title (2 MiB).
const maxPageBytes = 2 * 1024 * 1024
17
18
func fetchPageTitle(ctx context.Context, get func(context.Context, string) (*http.Response, error), rawURL string) (string, error) {
19
	if get == nil {
20
		return "", fmt.Errorf("missing page getter")
21
	}
22
23
	resp, err := get(ctx, rawURL)
24
	if err != nil {
25
		return "", err
26
	}
27
	defer resp.Body.Close()
28
29
	if resp.StatusCode < http.StatusOK || resp.StatusCode >= http.StatusMultipleChoices {
30
		return "", fmt.Errorf("unexpected status code: %d", resp.StatusCode)
31
	}
32
33
	decoded, err := charset.NewReader(io.LimitReader(resp.Body, maxPageBytes), resp.Header.Get("Content-Type"))
34
	if err != nil {
35
		return "", err
36
	}
37
38
	doc, err := goquery.NewDocumentFromReader(decoded)
39
	if err != nil {
40
		return "", err
41
	}
42
43
	title := normalizePageTitle(doc.Find("title").First().Text())
44
	if !isMeaningfulPageTitle(title) {
45
		title = metaPageTitle(doc)
46
	}
47
	if !isMeaningfulPageTitle(title) {
48
		if videoID, _, ok := youtubeCanonicalLink(rawURL); ok {
49
			ytTitle, ytErr := fetchYouTubeVideoTitle(ctx, get, videoID)
50
			if ytErr == nil {
51
				return ytTitle, nil
52
			}
53
		}
54
		return "", fmt.Errorf("page title is empty")
55
	}
56
	return title, nil
57
}
58
59
func metaPageTitle(doc *goquery.Document) string {
60
	selectors := []string{
61
		`meta[property="og:title"]`,
62
		`meta[name="og:title"]`,
63
		`meta[property="twitter:title"]`,
64
		`meta[name="twitter:title"]`,
65
		`meta[itemprop="name"]`,
66
	}
67
68
	for _, selector := range selectors {
69
		content, ok := doc.Find(selector).First().Attr("content")
70
		if !ok {
71
			continue
72
		}
73
		title := normalizePageTitle(content)
74
		if isMeaningfulPageTitle(title) {
75
			return title
76
		}
77
	}
78
	return ""
79
}
80
81
// youtubeOEmbedResponse is the subset of a YouTube oEmbed JSON reply that
// this package reads: only the video title.
type youtubeOEmbedResponse struct {
	// Title is decoded from the "title" key of the JSON object.
	Title string `json:"title"`
}
84
85
func fetchYouTubeVideoTitle(ctx context.Context, get func(context.Context, string) (*http.Response, error), videoID string) (string, error) {
86
	u := url.URL{
87
		Scheme: "https",
88
		Host:   "www.youtube.com",
89
		Path:   "/oembed",
90
	}
91
	q := u.Query()
92
	q.Set("url", "https://www.youtube.com/watch?v="+videoID)
93
	q.Set("format", "json")
94
	u.RawQuery = q.Encode()
95
96
	resp, err := get(ctx, u.String())
97
	if err != nil {
98
		return "", err
99
	}
100
	defer resp.Body.Close()
101
102
	if resp.StatusCode != http.StatusOK {
103
		return "", fmt.Errorf("oEmbed request failed with status %d", resp.StatusCode)
104
	}
105
106
	var oembed youtubeOEmbedResponse
107
	if err := json.NewDecoder(resp.Body).Decode(&oembed); err != nil {
108
		return "", err
109
	}
110
111
	title := normalizePageTitle(oembed.Title)
112
	if !isMeaningfulPageTitle(title) {
113
		return "", fmt.Errorf("oEmbed returned empty title")
114
	}
115
116
	return title, nil
117
}
118
119
// normalizePageTitle collapses every run of whitespace in raw into a single
// space and strips leading and trailing whitespace. Input that contains only
// whitespace (or nothing) yields the empty string.
func normalizePageTitle(raw string) string {
	words := strings.Fields(raw)
	if len(words) == 0 {
		return ""
	}

	var b strings.Builder
	b.Grow(len(raw))
	b.WriteString(words[0])
	for _, w := range words[1:] {
		b.WriteByte(' ')
		b.WriteString(w)
	}
	return b.String()
}
122
123
// isMeaningfulPageTitle reports whether title carries real information.
// After trimming surrounding whitespace and lowercasing, the empty string
// and the bare placeholders "youtube" and "- youtube" are rejected;
// everything else is accepted.
func isMeaningfulPageTitle(title string) bool {
	folded := strings.ToLower(strings.TrimSpace(title))
	if folded == "" {
		return false
	}
	return folded != "youtube" && folded != "- youtube"
}