all repos

rss-tools @ 1e9eff13ce7cd78efaaabbd1a9f98c5b0c30362b

Get RSS feeds from sources that (I need and that) don't provide one
6 files changed, 303 insertions(+), 15 deletions(-)
telegram: fetch link titles
Author: Oleksandr Smirnov <olexsmir@gmail.com>
Committed at: 2026-04-28 14:40:04 +0300
Authored at: 2026-04-24 18:41:09 +0300
Change ID: qxxqtrwwpuqvltkuwnvpmplqnytttqkx
Parent: ecbe2c0
M app/app.go
···
        57
        57
         }

      
        58
        58
         

      
        59
        59
         const (

      
        60
        
        -	defaultScraperUserAgent = "rss-tools/1.0)"

      
        
        60
        +	defaultScraperUserAgent = "rss-tools/1.0" // todo: i wanna be a chrome

      
        61
        61
         	defaultScraperAccept    = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"

      
        62
        62
         )

      
        63
        63
         

      
A sources/telegram/page_title.go
···
        
        1
        +package telegram

      
        
        2
        +

      
        
        3
        +import (

      
        
        4
        +	"context"

      
        
        5
        +	"fmt"

      
        
        6
        +	"io"

      
        
        7
        +	"net/http"

      
        
        8
        +	"strings"

      
        
        9
        +

      
        
        10
        +	"github.com/PuerkitoBio/goquery"

      
        
        11
        +	"golang.org/x/net/html/charset"

      
        
        12
        +)

      
        
        13
        +

      
        
        14
        +const maxPageBytes = 2 << 20 // 2 MiB

      
        
        15
        +

      
        
        16
        +func fetchPageTitle(ctx context.Context, get func(context.Context, string) (*http.Response, error), rawURL string) (string, error) {

      
        
        17
        +	if get == nil {

      
        
        18
        +		return "", fmt.Errorf("missing page getter")

      
        
        19
        +	}

      
        
        20
        +

      
        
        21
        +	resp, err := get(ctx, rawURL)

      
        
        22
        +	if err != nil {

      
        
        23
        +		return "", err

      
        
        24
        +	}

      
        
        25
        +	defer resp.Body.Close()

      
        
        26
        +

      
        
        27
        +	if resp.StatusCode < http.StatusOK || resp.StatusCode >= http.StatusMultipleChoices {

      
        
        28
        +		return "", fmt.Errorf("unexpected status code: %d", resp.StatusCode)

      
        
        29
        +	}

      
        
        30
        +

      
        
        31
        +	decoded, err := charset.NewReader(io.LimitReader(resp.Body, maxPageBytes), resp.Header.Get("Content-Type"))

      
        
        32
        +	if err != nil {

      
        
        33
        +		return "", err

      
        
        34
        +	}

      
        
        35
        +

      
        
        36
        +	doc, err := goquery.NewDocumentFromReader(decoded)

      
        
        37
        +	if err != nil {

      
        
        38
        +		return "", err

      
        
        39
        +	}

      
        
        40
        +

      
        
        41
        +	title := normalizePageTitle(doc.Find("title").First().Text())

      
        
        42
        +	if !isMeaningfulPageTitle(title) {

      
        
        43
        +		title = metaPageTitle(doc)

      
        
        44
        +	}

      
        
        45
        +	if !isMeaningfulPageTitle(title) {

      
        
        46
        +		return "", fmt.Errorf("page title is empty")

      
        
        47
        +	}

      
        
        48
        +	return title, nil

      
        
        49
        +}

      
        
        50
        +

      
        
        51
        +func metaPageTitle(doc *goquery.Document) string {

      
        
        52
        +	selectors := []string{

      
        
        53
        +		`meta[property="og:title"]`,

      
        
        54
        +		`meta[name="og:title"]`,

      
        
        55
        +		`meta[property="twitter:title"]`,

      
        
        56
        +		`meta[name="twitter:title"]`,

      
        
        57
        +		`meta[itemprop="name"]`,

      
        
        58
        +	}

      
        
        59
        +

      
        
        60
        +	for _, selector := range selectors {

      
        
        61
        +		content, ok := doc.Find(selector).First().Attr("content")

      
        
        62
        +		if !ok {

      
        
        63
        +			continue

      
        
        64
        +		}

      
        
        65
        +		title := normalizePageTitle(content)

      
        
        66
        +		if isMeaningfulPageTitle(title) {

      
        
        67
        +			return title

      
        
        68
        +		}

      
        
        69
        +	}

      
        
        70
        +	return ""

      
        
        71
        +}

      
        
        72
        +

      
        
        73
        +func normalizePageTitle(raw string) string {

      
        
        74
        +	return strings.Join(strings.Fields(raw), " ")

      
        
        75
        +}

      
        
        76
        +

      
        
        77
        +func isMeaningfulPageTitle(title string) bool {

      
        
        78
        +	switch strings.ToLower(strings.TrimSpace(title)) {

      
        
        79
        +	case "", "- youtube", "youtube":

      
        
        80
        +		return false

      
        
        81
        +	default:

      
        
        82
        +		return true

      
        
        83
        +	}

      
        
        84
        +}

      
A sources/telegram/page_title_test.go
···
        
        1
        +package telegram

      
        
        2
        +

      
        
        3
        +import (

      
        
        4
        +	"context"

      
        
        5
        +	"io"

      
        
        6
        +	"log/slog"

      
        
        7
        +	"net/http"

      
        
        8
        +	"strings"

      
        
        9
        +	"testing"

      
        
        10
        +

      
        
        11
        +	"olexsmir.xyz/x/is"

      
        
        12
        +)

      
        
        13
        +

      
        
        14
        +func TestEnrichMessageWithLinkTitlesStoresFetchedTitle(t *testing.T) {

      
        
        15
        +	calls := 0

      
        
        16
        +	tg := &telegram{

      
        
        17
        +		get: func(_ context.Context, url string) (*http.Response, error) {

      
        
        18
        +			calls++

      
        
        19
        +			is.Equal(t, "https://example.com/post", url)

      
        
        20
        +			return &http.Response{

      
        
        21
        +				StatusCode: http.StatusOK,

      
        
        22
        +				Header: http.Header{

      
        
        23
        +					"Content-Type": []string{"text/html; charset=utf-8"},

      
        
        24
        +				},

      
        
        25
        +				Body: io.NopCloser(strings.NewReader(`<html><head><title> Example Post Title </title></head></html>`)),

      
        
        26
        +			}, nil

      
        
        27
        +		},

      
        
        28
        +		logger: slog.Default(),

      
        
        29
        +	}

      
        
        30
        +	msg := &Message{Text: "https://example.com/post"}

      
        
        31
        +

      
        
        32
        +	changed := tg.enrichMessageWithLinkTitles(context.Background(), msg)

      
        
        33
        +	is.Equal(t, true, changed)

      
        
        34
        +	is.Equal(t, 1, calls)

      
        
        35
        +	is.Equal(t, "Example Post Title", msg.LinkTitles["https://example.com/post"])

      
        
        36
        +

      
        
        37
        +	changed = tg.enrichMessageWithLinkTitles(context.Background(), msg)

      
        
        38
        +	is.Equal(t, false, changed)

      
        
        39
        +	is.Equal(t, 1, calls)

      
        
        40
        +}

      
        
        41
        +

      
        
        42
        +func TestEnrichMessageWithLinkTitlesRefreshesPlaceholderCachedTitle(t *testing.T) {

      
        
        43
        +	calls := 0

      
        
        44
        +	tg := &telegram{

      
        
        45
        +		get: func(_ context.Context, _ string) (*http.Response, error) {

      
        
        46
        +			calls++

      
        
        47
        +			return &http.Response{

      
        
        48
        +				StatusCode: http.StatusOK,

      
        
        49
        +				Header: http.Header{

      
        
        50
        +					"Content-Type": []string{"text/html; charset=utf-8"},

      
        
        51
        +				},

      
        
        52
        +				Body: io.NopCloser(strings.NewReader(`<html><head><title>Real Video Title</title></head></html>`)),

      
        
        53
        +			}, nil

      
        
        54
        +		},

      
        
        55
        +		logger: slog.Default(),

      
        
        56
        +	}

      
        
        57
        +	msg := &Message{

      
        
        58
        +		Text: "https://www.youtube.com/watch?v=dQw4w9WgXcQ",

      
        
        59
        +		LinkTitles: map[string]string{

      
        
        60
        +			"https://www.youtube.com/watch?v=dQw4w9WgXcQ": " - YouTube ",

      
        
        61
        +		},

      
        
        62
        +	}

      
        
        63
        +

      
        
        64
        +	changed := tg.enrichMessageWithLinkTitles(context.Background(), msg)

      
        
        65
        +	is.Equal(t, true, changed)

      
        
        66
        +	is.Equal(t, 1, calls)

      
        
        67
        +	is.Equal(t, "Real Video Title", msg.LinkTitles["https://www.youtube.com/watch?v=dQw4w9WgXcQ"])

      
        
        68
        +}

      
        
        69
        +

      
        
        70
        +func TestIsSingleLinkMessage(t *testing.T) {

      
        
        71
        +	is.Equal(t, true, isSingleLinkMessage(" https://example.com/path. "))

      
        
        72
        +	is.Equal(t, false, isSingleLinkMessage("check https://example.com/path"))

      
        
        73
        +}

      
        
        74
        +

      
        
        75
        +func TestEnrichMessageWithLinkTitlesIgnoresNonSingleLinkMessages(t *testing.T) {

      
        
        76
        +	calls := 0

      
        
        77
        +	tg := &telegram{

      
        
        78
        +		get: func(_ context.Context, _ string) (*http.Response, error) {

      
        
        79
        +			calls++

      
        
        80
        +			return nil, nil

      
        
        81
        +		},

      
        
        82
        +		logger: slog.Default(),

      
        
        83
        +	}

      
        
        84
        +	msg := &Message{Text: "check this https://example.com/post"}

      
        
        85
        +

      
        
        86
        +	changed := tg.enrichMessageWithLinkTitles(context.Background(), msg)

      
        
        87
        +	is.Equal(t, false, changed)

      
        
        88
        +	is.Equal(t, 0, calls)

      
        
        89
        +}

      
        
        90
        +

      
        
        91
        +func TestFetchPageTitleFallsBackToMetaTitleForYouTubePlaceholder(t *testing.T) {

      
        
        92
        +	title, err := fetchPageTitle(context.Background(), func(_ context.Context, _ string) (*http.Response, error) {

      
        
        93
        +		return &http.Response{

      
        
        94
        +			StatusCode: http.StatusOK,

      
        
        95
        +			Header: http.Header{

      
        
        96
        +				"Content-Type": []string{"text/html; charset=utf-8"},

      
        
        97
        +			},

      
        
        98
        +			Body: io.NopCloser(strings.NewReader(`<html><head><title> - YouTube </title><meta property="og:title" content="Real Video Title"></head></html>`)),

      
        
        99
        +		}, nil

      
        
        100
        +	}, "https://www.youtube.com/watch?v=dQw4w9WgXcQ")

      
        
        101
        +	if err != nil {

      
        
        102
        +		t.Fatalf("unexpected error: %v", err)

      
        
        103
        +	}

      
        
        104
        +	is.Equal(t, "Real Video Title", title)

      
        
        105
        +}

      
        
        106
        +

      
        
        107
        +func TestFetchPageTitleRejectsYouTubePlaceholderWithoutMetadata(t *testing.T) {

      
        
        108
        +	_, err := fetchPageTitle(context.Background(), func(_ context.Context, _ string) (*http.Response, error) {

      
        
        109
        +		return &http.Response{

      
        
        110
        +			StatusCode: http.StatusOK,

      
        
        111
        +			Header: http.Header{

      
        
        112
        +				"Content-Type": []string{"text/html; charset=utf-8"},

      
        
        113
        +			},

      
        
        114
        +			Body: io.NopCloser(strings.NewReader(`<html><head><title> - YouTube </title></head></html>`)),

      
        
        115
        +		}, nil

      
        
        116
        +	}, "https://www.youtube.com/watch?v=dQw4w9WgXcQ")

      
        
        117
        +	if err == nil {

      
        
        118
        +		t.Fatalf("expected an error for placeholder title")

      
        
        119
        +	}

      
        
        120
        +}

      
M sources/telegram/sdk.go
···
        52
        52
         }

      
        53
        53
         

      
        54
        54
         type Message struct {

      
        55
        
        -	MessageID     int64       `json:"message_id"`

      
        56
        
        -	From          *User       `json:"from"`

      
        57
        
        -	Chat          *Chat       `json:"chat"`

      
        58
        
        -	Text          string      `json:"text"`

      
        59
        
        -	Caption       string      `json:"caption,omitempty"`

      
        60
        
        -	Date          int64       `json:"date"`

      
        61
        
        -	Photo         []PhotoSize `json:"photo,omitempty"`

      
        62
        
        -	PhotoBase64   string      `json:"photo_base64,omitempty"`

      
        63
        
        -	PhotoMIMEType string      `json:"photo_mime_type,omitempty"`

      
        
        55
        +	MessageID     int64             `json:"message_id"`

      
        
        56
        +	From          *User             `json:"from"`

      
        
        57
        +	Chat          *Chat             `json:"chat"`

      
        
        58
        +	Text          string            `json:"text"`

      
        
        59
        +	Caption       string            `json:"caption,omitempty"`

      
        
        60
        +	Date          int64             `json:"date"`

      
        
        61
        +	Photo         []PhotoSize       `json:"photo,omitempty"`

      
        
        62
        +	PhotoBase64   string            `json:"photo_base64,omitempty"`

      
        
        63
        +	PhotoMIMEType string            `json:"photo_mime_type,omitempty"`

      
        
        64
        +	LinkTitles    map[string]string `json:"-"`

      
        64
        65
         }

      
        65
        66
         

      
        66
        67
         type PhotoSize struct {

      
M sources/telegram/telegram.go
···
        18
        18
         	db        *app.Bucket

      
        19
        19
         	messages  *app.Bucket

      
        20
        20
         	client    *http.Client

      
        
        21
        +	get       func(context.Context, string) (*http.Response, error)

      
        21
        22
         	tg        *TelegramSDK

      
        22
        23
         	allowedID int64

      
        23
        
        -	logger *slog.Logger

      
        
        24
        +	logger    *slog.Logger

      
        24
        25
         }

      
        25
        26
         

      
        26
        27
         func Register(a *app.App) error {

      ···
        38
        39
         		db:        db,

      
        39
        40
         		messages:  messages,

      
        40
        41
         		client:    a.Client,

      
        
        42
        +		get:       a.Get,

      
        41
        43
         		tg:        NewSDK(a.Client, a.Config.TGToken),

      
        42
        44
         		allowedID: a.Config.TGUserID,

      
        43
        
        -		logger: a.Logger,

      
        
        45
        +		logger:    a.Logger,

      
        44
        46
         	}

      
        45
        47
         

      
        46
        48
         	a.AddWorker(t.worker)

      ···
        52
        54
         	// todo: cache feed construction

      
        53
        55
         	// todo: don't include messages older than N days

      
        54
        56
         

      
        55
        
        -	messages, err := t.loadMessages()

      
        
        57
        +	messages, err := t.loadMessages(r.Context())

      
        56
        58
         	if err != nil {

      
        57
        59
         		http.Error(w, "failed to load messages", http.StatusInternalServerError)

      
        58
        60
         		return

      ···
        60
        62
         

      
        61
        63
         	feed := app.NewFeed("Telegram feed", "telegram-feed")

      
        62
        64
         	for _, m := range messages {

      
        
        65
        +		if changed := t.enrichMessageWithLinkTitles(r.Context(), m); changed {

      
        
        66
        +			if err := t.saveMessage(m); err != nil {

      
        
        67
        +				http.Error(w, "failed to update cached titles", http.StatusInternalServerError)

      
        
        68
        +				return

      
        
        69
        +			}

      
        
        70
        +		}

      
        63
        71
         		feed.Add(feedEntryFromMessage(m))

      
        64
        72
         	}

      
        65
        73
         

      ···
        70
        78
         }

      
        71
        79
         

      
        72
        80
         func (t *telegram) worker(ctx context.Context) error {

      
        
        81
        +	t.logger.Info("starting telegram bot")

      
        
        82
        +

      
        73
        83
         	offset, err := t.loadOffset()

      
        74
        84
         	if err != nil {

      
        75
        85
         		return err

      ···
        96
        106
         				offset = u.UpdateID + 1

      
        97
        107
         				continue

      
        98
        108
         			}

      
        
        109
        +

      
        
        110
        +			_ = t.enrichMessageWithLinkTitles(ctx, u.Message)

      
        99
        111
         

      
        100
        112
         			if err := t.saveMessage(u.Message); err != nil {

      
        101
        113
         				t.logger.ErrorContext(ctx, "failed to save message", "err", err)

      ···
        141
        153
         	return t.messages.Set(key, buf.Bytes())

      
        142
        154
         }

      
        143
        155
         

      
        144
        
        -func (t *telegram) loadMessages() ([]*Message, error) {

      
        
        156
        +func (t *telegram) loadMessages(ctx context.Context) ([]*Message, error) {

      
        145
        157
         	var messages []*Message

      
        146
        158
         	err := t.messages.ForEach(func(k, v []byte) error {

      
        147
        159
         		var m Message

      
        148
        160
         		if err := gob.NewDecoder(bytes.NewReader(v)).Decode(&m); err != nil {

      
        149
        
        -			return err

      
        
        161
        +			t.logger.WarnContext(ctx, "failed to decode telegram message, skipping", "key", fmt.Sprintf("%x", k), "err", err)

      
        
        162
        +			return nil

      
        150
        163
         		}

      
        151
        164
         		messages = append(messages, &m)

      
        152
        165
         		return nil

      ···
        154
        167
         	return messages, err

      
        155
        168
         }

      
        156
        169
         

      
        
        170
        +func (t *telegram) enrichMessageWithLinkTitles(ctx context.Context, m *Message) bool {

      
        
        171
        +	text := messageText(m)

      
        
        172
        +	if !isSingleLinkMessage(text) {

      
        
        173
        +		return false

      
        
        174
        +	}

      
        
        175
        +

      
        
        176
        +	links := normalizeLinks(messageLinks(text))

      
        
        177
        +	if len(links) == 0 {

      
        
        178
        +		return false

      
        
        179
        +	}

      
        
        180
        +	if m.LinkTitles == nil {

      
        
        181
        +		m.LinkTitles = make(map[string]string, len(links))

      
        
        182
        +	}

      
        
        183
        +

      
        
        184
        +	changed := false

      
        
        185
        +	for _, link := range links {

      
        
        186
        +		cachedTitle := normalizePageTitle(m.LinkTitles[link])

      
        
        187
        +		if isMeaningfulPageTitle(cachedTitle) {

      
        
        188
        +			continue

      
        
        189
        +		}

      
        
        190
        +		if cachedTitle != "" {

      
        
        191
        +			delete(m.LinkTitles, link)

      
        
        192
        +			changed = true

      
        
        193
        +		}

      
        
        194
        +		title, err := fetchPageTitle(ctx, t.get, link)

      
        
        195
        +		if err != nil {

      
        
        196
        +			t.logger.WarnContext(ctx, "failed to lookup page title", "url", link, "err", err)

      
        
        197
        +			continue

      
        
        198
        +		}

      
        
        199
        +		m.LinkTitles[link] = title

      
        
        200
        +		changed = true

      
        
        201
        +	}

      
        
        202
        +	return changed

      
        
        203
        +}

      
        
        204
        +

      
        157
        205
         func feedEntryFromMessage(m *Message) app.FeedEntry {

      
        158
        206
         	updated := time.Unix(m.Date, 0)

      
        159
        207
         	text := messageText(m)

      ···
        165
        213
         

      
        166
        214
         	if m.PhotoBase64 == "" {

      
        167
        215
         		title := text

      
        
        216
        +		if isSingleLinkMessage(text) {

      
        
        217
        +			for _, link := range normalizedLinks {

      
        
        218
        +				if t := strings.TrimSpace(m.LinkTitles[link]); t != "" {

      
        
        219
        +					title = t

      
        
        220
        +					break

      
        
        221
        +				}

      
        
        222
        +			}

      
        
        223
        +		}

      
        168
        224
         		if len(title) > 64 {

      
        169
        225
         			title = title[:64] + "..."

      
        170
        226
         		}

      ···
        205
        261
         		ContentType: "html",

      
        206
        262
         		Updated:     updated,

      
        207
        263
         	}

      
        
        264
        +}

      
        
        265
        +

      
        
        266
        +func isSingleLinkMessage(text string) bool {

      
        
        267
        +	links := findLinks(text)

      
        
        268
        +	if len(links) != 1 {

      
        
        269
        +		return false

      
        
        270
        +	}

      
        
        271
        +	link := links[0]

      
        
        272
        +	if strings.TrimSpace(text[:link.start]) != "" {

      
        
        273
        +		return false

      
        
        274
        +	}

      
        
        275
        +	after := strings.TrimSpace(text[link.end:])

      
        
        276
        +	return trailingPunctRe.ReplaceAllString(after, "") == ""

      
        208
        277
         }

      
        209
        278
         

      
        210
        279
         func messageText(m *Message) string {

      
M sources/telegram/telegram_test.go
···
        62
        62
         	is.Equal(t, "https://www.youtube.com/watch?v=dQw4w9WgXcQ", entry.Links[1].Href)

      
        63
        63
         	is.Equal(t, "yt:video:dQw4w9WgXcQ", entry.ID)

      
        64
        64
         }

      
        
        65
        +

      
        
        66
        +func TestFeedEntryFromMessageUsesStoredLinkTitleForSingleLink(t *testing.T) {

      
        
        67
        +	msg := &Message{

      
        
        68
        +		MessageID: 16,

      
        
        69
        +		Text:      "https://example.com/post",

      
        
        70
        +		Date:      time.Date(2026, 4, 23, 11, 0, 0, 0, time.UTC).Unix(),

      
        
        71
        +		LinkTitles: map[string]string{

      
        
        72
        +			"https://example.com/post": "Example Post Title",

      
        
        73
        +		},

      
        
        74
        +	}

      
        
        75
        +

      
        
        76
        +	entry := feedEntryFromMessage(msg)

      
        
        77
        +	is.Equal(t, "Example Post Title", entry.Title)

      
        
        78
        +}