6 files changed,
303 insertions(+),
15 deletions(-)
Author:
Oleksandr Smirnov
olexsmir@gmail.com
Committed at:
2026-04-28 14:40:04 +0300
Authored at:
2026-04-24 18:41:09 +0300
Change ID:
qxxqtrwwpuqvltkuwnvpmplqnytttqkx
Parent:
ecbe2c0
A
sources/telegram/page_title.go
··· 1 +package telegram 2 + 3 +import ( 4 + "context" 5 + "fmt" 6 + "io" 7 + "net/http" 8 + "strings" 9 + 10 + "github.com/PuerkitoBio/goquery" 11 + "golang.org/x/net/html/charset" 12 +) 13 + 14 +const maxPageBytes = 2 << 20 // 2 MiB 15 + 16 +func fetchPageTitle(ctx context.Context, get func(context.Context, string) (*http.Response, error), rawURL string) (string, error) { 17 + if get == nil { 18 + return "", fmt.Errorf("missing page getter") 19 + } 20 + 21 + resp, err := get(ctx, rawURL) 22 + if err != nil { 23 + return "", err 24 + } 25 + defer resp.Body.Close() 26 + 27 + if resp.StatusCode < http.StatusOK || resp.StatusCode >= http.StatusMultipleChoices { 28 + return "", fmt.Errorf("unexpected status code: %d", resp.StatusCode) 29 + } 30 + 31 + decoded, err := charset.NewReader(io.LimitReader(resp.Body, maxPageBytes), resp.Header.Get("Content-Type")) 32 + if err != nil { 33 + return "", err 34 + } 35 + 36 + doc, err := goquery.NewDocumentFromReader(decoded) 37 + if err != nil { 38 + return "", err 39 + } 40 + 41 + title := normalizePageTitle(doc.Find("title").First().Text()) 42 + if !isMeaningfulPageTitle(title) { 43 + title = metaPageTitle(doc) 44 + } 45 + if !isMeaningfulPageTitle(title) { 46 + return "", fmt.Errorf("page title is empty") 47 + } 48 + return title, nil 49 +} 50 + 51 +func metaPageTitle(doc *goquery.Document) string { 52 + selectors := []string{ 53 + `meta[property="og:title"]`, 54 + `meta[name="og:title"]`, 55 + `meta[property="twitter:title"]`, 56 + `meta[name="twitter:title"]`, 57 + `meta[itemprop="name"]`, 58 + } 59 + 60 + for _, selector := range selectors { 61 + content, ok := doc.Find(selector).First().Attr("content") 62 + if !ok { 63 + continue 64 + } 65 + title := normalizePageTitle(content) 66 + if isMeaningfulPageTitle(title) { 67 + return title 68 + } 69 + } 70 + return "" 71 +} 72 + 73 +func normalizePageTitle(raw string) string { 74 + return strings.Join(strings.Fields(raw), " ") 75 +} 76 + 77 +func isMeaningfulPageTitle(title string) bool { 78 + switch strings.ToLower(strings.TrimSpace(title)) { 79 + case "", "- youtube", "youtube": 80 + return false 81 + default: 82 + return true 83 + } 84 +}
A
sources/telegram/page_title_test.go
··· 1 +package telegram 2 + 3 +import ( 4 + "context" 5 + "io" 6 + "log/slog" 7 + "net/http" 8 + "strings" 9 + "testing" 10 + 11 + "olexsmir.xyz/x/is" 12 +) 13 + 14 +func TestEnrichMessageWithLinkTitlesStoresFetchedTitle(t *testing.T) { 15 + calls := 0 16 + tg := &telegram{ 17 + get: func(_ context.Context, url string) (*http.Response, error) { 18 + calls++ 19 + is.Equal(t, "https://example.com/post", url) 20 + return &http.Response{ 21 + StatusCode: http.StatusOK, 22 + Header: http.Header{ 23 + "Content-Type": []string{"text/html; charset=utf-8"}, 24 + }, 25 + Body: io.NopCloser(strings.NewReader(`<html><head><title> Example Post Title </title></head></html>`)), 26 + }, nil 27 + }, 28 + logger: slog.Default(), 29 + } 30 + msg := &Message{Text: "https://example.com/post"} 31 + 32 + changed := tg.enrichMessageWithLinkTitles(context.Background(), msg) 33 + is.Equal(t, true, changed) 34 + is.Equal(t, 1, calls) 35 + is.Equal(t, "Example Post Title", msg.LinkTitles["https://example.com/post"]) 36 + 37 + changed = tg.enrichMessageWithLinkTitles(context.Background(), msg) 38 + is.Equal(t, false, changed) 39 + is.Equal(t, 1, calls) 40 +} 41 + 42 +func TestEnrichMessageWithLinkTitlesRefreshesPlaceholderCachedTitle(t *testing.T) { 43 + calls := 0 44 + tg := &telegram{ 45 + get: func(_ context.Context, _ string) (*http.Response, error) { 46 + calls++ 47 + return &http.Response{ 48 + StatusCode: http.StatusOK, 49 + Header: http.Header{ 50 + "Content-Type": []string{"text/html; charset=utf-8"}, 51 + }, 52 + Body: io.NopCloser(strings.NewReader(`<html><head><title>Real Video Title</title></head></html>`)), 53 + }, nil 54 + }, 55 + logger: slog.Default(), 56 + } 57 + msg := &Message{ 58 + Text: "https://www.youtube.com/watch?v=dQw4w9WgXcQ", 59 + LinkTitles: map[string]string{ 60 + "https://www.youtube.com/watch?v=dQw4w9WgXcQ": " - YouTube ", 61 + }, 62 + } 63 + 64 + changed := tg.enrichMessageWithLinkTitles(context.Background(), msg) 65 + is.Equal(t, true, changed) 66 + is.Equal(t, 1, calls) 67 + is.Equal(t, "Real Video Title", msg.LinkTitles["https://www.youtube.com/watch?v=dQw4w9WgXcQ"]) 68 +} 69 + 70 +func TestIsSingleLinkMessage(t *testing.T) { 71 + is.Equal(t, true, isSingleLinkMessage(" https://example.com/path. ")) 72 + is.Equal(t, false, isSingleLinkMessage("check https://example.com/path")) 73 +} 74 + 75 +func TestEnrichMessageWithLinkTitlesIgnoresNonSingleLinkMessages(t *testing.T) { 76 + calls := 0 77 + tg := &telegram{ 78 + get: func(_ context.Context, _ string) (*http.Response, error) { 79 + calls++ 80 + return nil, nil 81 + }, 82 + logger: slog.Default(), 83 + } 84 + msg := &Message{Text: "check this https://example.com/post"} 85 + 86 + changed := tg.enrichMessageWithLinkTitles(context.Background(), msg) 87 + is.Equal(t, false, changed) 88 + is.Equal(t, 0, calls) 89 +} 90 + 91 +func TestFetchPageTitleFallsBackToMetaTitleForYouTubePlaceholder(t *testing.T) { 92 + title, err := fetchPageTitle(context.Background(), func(_ context.Context, _ string) (*http.Response, error) { 93 + return &http.Response{ 94 + StatusCode: http.StatusOK, 95 + Header: http.Header{ 96 + "Content-Type": []string{"text/html; charset=utf-8"}, 97 + }, 98 + Body: io.NopCloser(strings.NewReader(`<html><head><title> - YouTube </title><meta property="og:title" content="Real Video Title"></head></html>`)), 99 + }, nil 100 + }, "https://www.youtube.com/watch?v=dQw4w9WgXcQ") 101 + if err != nil { 102 + t.Fatalf("unexpected error: %v", err) 103 + } 104 + is.Equal(t, "Real Video Title", title) 105 +} 106 + 107 +func TestFetchPageTitleRejectsYouTubePlaceholderWithoutMetadata(t *testing.T) { 108 + _, err := fetchPageTitle(context.Background(), func(_ context.Context, _ string) (*http.Response, error) { 109 + return &http.Response{ 110 + StatusCode: http.StatusOK, 111 + Header: http.Header{ 112 + "Content-Type": []string{"text/html; charset=utf-8"}, 113 + }, 114 + Body: io.NopCloser(strings.NewReader(`<html><head><title> - YouTube </title></head></html>`)), 115 + }, nil 116 + }, "https://www.youtube.com/watch?v=dQw4w9WgXcQ") 117 + if err == nil { 118 + t.Fatalf("expected an error for placeholder title") 119 + } 120 +}
M
sources/telegram/sdk.go
··· 52 52 } 53 53 54 54 type Message struct { 55 - MessageID int64 `json:"message_id"` 56 - From *User `json:"from"` 57 - Chat *Chat `json:"chat"` 58 - Text string `json:"text"` 59 - Caption string `json:"caption,omitempty"` 60 - Date int64 `json:"date"` 61 - Photo []PhotoSize `json:"photo,omitempty"` 62 - PhotoBase64 string `json:"photo_base64,omitempty"` 63 - PhotoMIMEType string `json:"photo_mime_type,omitempty"` 55 + MessageID int64 `json:"message_id"` 56 + From *User `json:"from"` 57 + Chat *Chat `json:"chat"` 58 + Text string `json:"text"` 59 + Caption string `json:"caption,omitempty"` 60 + Date int64 `json:"date"` 61 + Photo []PhotoSize `json:"photo,omitempty"` 62 + PhotoBase64 string `json:"photo_base64,omitempty"` 63 + PhotoMIMEType string `json:"photo_mime_type,omitempty"` 64 + LinkTitles map[string]string `json:"-"` 64 65 } 65 66 66 67 type PhotoSize struct {
M
sources/telegram/telegram.go
··· 18 18 db *app.Bucket 19 19 messages *app.Bucket 20 20 client *http.Client 21 + get func(context.Context, string) (*http.Response, error) 21 22 tg *TelegramSDK 22 23 allowedID int64 23 - logger *slog.Logger 24 + logger *slog.Logger 24 25 } 25 26 26 27 func Register(a *app.App) error { ··· 38 39 db: db, 39 40 messages: messages, 40 41 client: a.Client, 42 + get: a.Get, 41 43 tg: NewSDK(a.Client, a.Config.TGToken), 42 44 allowedID: a.Config.TGUserID, 43 - logger: a.Logger, 45 + logger: a.Logger, 44 46 } 45 47 46 48 a.AddWorker(t.worker) ··· 52 54 // todo: cache feed contruction 53 55 // todo: dont include messages older than N days 54 56 55 - messages, err := t.loadMessages() 57 + messages, err := t.loadMessages(r.Context()) 56 58 if err != nil { 57 59 http.Error(w, "failed to load messages", http.StatusInternalServerError) 58 60 return ··· 60 62 61 63 feed := app.NewFeed("Telegram feed", "telegram-feed") 62 64 for _, m := range messages { 65 + if changed := t.enrichMessageWithLinkTitles(r.Context(), m); changed { 66 + if err := t.saveMessage(m); err != nil { 67 + http.Error(w, "failed to update cached titles", http.StatusInternalServerError) 68 + return 69 + } 70 + } 63 71 feed.Add(feedEntryFromMessage(m)) 64 72 } 65 73 ··· 70 78 } 71 79 72 80 func (t *telegram) worker(ctx context.Context) error { 81 + t.logger.Info("starting telegram bot") 82 + 73 83 offset, err := t.loadOffset() 74 84 if err != nil { 75 85 return err ··· 96 106 offset = u.UpdateID + 1 97 107 continue 98 108 } 109 + 110 + _ = t.enrichMessageWithLinkTitles(ctx, u.Message) 99 111 100 112 if err := t.saveMessage(u.Message); err != nil { 101 113 t.logger.ErrorContext(ctx, "failed to save message", "err", err) ··· 141 153 return t.messages.Set(key, buf.Bytes()) 142 154 } 143 155 144 -func (t *telegram) loadMessages() ([]*Message, error) { 156 +func (t *telegram) loadMessages(ctx context.Context) ([]*Message, error) { 145 157 var messages []*Message 146 158 err := t.messages.ForEach(func(k, v []byte) error { 147 159 var m Message 148 160 if err := gob.NewDecoder(bytes.NewReader(v)).Decode(&m); err != nil { 149 - return err 161 + t.logger.WarnContext(ctx, "failed to decode telegram message, skipping", "key", fmt.Sprintf("%x", k), "err", err) 162 + return nil 150 163 } 151 164 messages = append(messages, &m) 152 165 return nil ··· 154 167 return messages, err 155 168 } 156 169 170 +func (t *telegram) enrichMessageWithLinkTitles(ctx context.Context, m *Message) bool { 171 + text := messageText(m) 172 + if !isSingleLinkMessage(text) { 173 + return false 174 + } 175 + 176 + links := normalizeLinks(messageLinks(text)) 177 + if len(links) == 0 { 178 + return false 179 + } 180 + if m.LinkTitles == nil { 181 + m.LinkTitles = make(map[string]string, len(links)) 182 + } 183 + 184 + changed := false 185 + for _, link := range links { 186 + cachedTitle := normalizePageTitle(m.LinkTitles[link]) 187 + if isMeaningfulPageTitle(cachedTitle) { 188 + continue 189 + } 190 + if cachedTitle != "" { 191 + delete(m.LinkTitles, link) 192 + changed = true 193 + } 194 + title, err := fetchPageTitle(ctx, t.get, link) 195 + if err != nil { 196 + t.logger.WarnContext(ctx, "failed to lookup page title", "url", link, "err", err) 197 + continue 198 + } 199 + m.LinkTitles[link] = title 200 + changed = true 201 + } 202 + return changed 203 +} 204 + 157 205 func feedEntryFromMessage(m *Message) app.FeedEntry { 158 206 updated := time.Unix(m.Date, 0) 159 207 text := messageText(m) ··· 165 213 166 214 if m.PhotoBase64 == "" { 167 215 title := text 216 + if isSingleLinkMessage(text) { 217 + for _, link := range normalizedLinks { 218 + if t := strings.TrimSpace(m.LinkTitles[link]); t != "" { 219 + title = t 220 + break 221 + } 222 + } 223 + } 168 224 if len(title) > 64 { 169 225 title = title[:64] + "..." 170 226 } ··· 205 261 ContentType: "html", 206 262 Updated: updated, 207 263 } 264 +} 265 + 266 +func isSingleLinkMessage(text string) bool { 267 + links := findLinks(text) 268 + if len(links) != 1 { 269 + return false 270 + } 271 + link := links[0] 272 + if strings.TrimSpace(text[:link.start]) != "" { 273 + return false 274 + } 275 + after := strings.TrimSpace(text[link.end:]) 276 + return trailingPunctRe.ReplaceAllString(after, "") == "" 208 277 } 209 278 210 279 func messageText(m *Message) string {
M
sources/telegram/telegram_test.go
··· 62 62 is.Equal(t, "https://www.youtube.com/watch?v=dQw4w9WgXcQ", entry.Links[1].Href) 63 63 is.Equal(t, "yt:video:dQw4w9WgXcQ", entry.ID) 64 64 } 65 + 66 +func TestFeedEntryFromMessageUsesStoredLinkTitleForSingleLink(t *testing.T) { 67 + msg := &Message{ 68 + MessageID: 16, 69 + Text: "https://example.com/post", 70 + Date: time.Date(2026, 4, 23, 11, 0, 0, 0, time.UTC).Unix(), 71 + LinkTitles: map[string]string{ 72 + "https://example.com/post": "Example Post Title", 73 + }, 74 + } 75 + 76 + entry := feedEntryFromMessage(msg) 77 + is.Equal(t, "Example Post Title", entry.Title) 78 +}