all repos

anpi @ 2122b1be1ed91cb8a550a8dc7769f20305662943

yaml to anki importer

anpi/cambridge.go (view raw)

Olexandr Smirnov Olexandr Smirnov
olexsmir@gmail.com
repurpose of the tool, 8 days ago
1
package main
2
3
import (
	"fmt"
	"io"
	"net/http"
	"net/url"
	"slices"
	"strings"
	"time"

	"golang.org/x/net/html"
)
12
13
// cambridgeURL is the default URL prefix for dictionary pages; the word being
// looked up is appended to it.
const cambridgeURL = "https://dictionary.cambridge.org/dictionary/english/"

// userAgent is the default User-Agent header value sent with each request.
const userAgent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
17
18
// CambridgeDictionary looks words up by fetching and scraping dictionary
// pages served under baseURL.
type CambridgeDictionary struct {
	// baseURL is the URL prefix to which the looked-up word is appended.
	baseURL string
	// ua is the value sent in the User-Agent request header.
	ua string
}
22
23
// NewCambridgeDictionary returns a dictionary client configured with the
// package's default base URL and User-Agent.
func NewCambridgeDictionary() *CambridgeDictionary {
	d := new(CambridgeDictionary)
	d.baseURL = cambridgeURL
	d.ua = userAgent
	return d
}
26
27
// cambridgeHTTPClient is the client used for every dictionary request.
// http.DefaultClient has no timeout, so a stalled server would block a lookup
// forever; this timeout bounds the whole exchange, including reading the body.
var cambridgeHTTPClient = &http.Client{Timeout: 30 * time.Second}

// fetch requests the dictionary page for word and returns the response body.
//
// The word is trimmed, lower-cased and path-escaped before being appended to
// d.baseURL, so characters such as '?', '#' or '/' cannot alter the request
// target. Any status other than 200 OK is reported as an error and the body is
// closed. On success the caller owns the returned body and must close it.
func (d *CambridgeDictionary) fetch(word string) (io.ReadCloser, error) {
	slug := url.PathEscape(strings.ToLower(strings.TrimSpace(word)))
	req, err := http.NewRequest(http.MethodGet, d.baseURL+slug, nil)
	if err != nil {
		return nil, fmt.Errorf("building request for %q: %w", word, err)
	}
	req.Header.Set("User-Agent", d.ua)

	resp, err := cambridgeHTTPClient.Do(req)
	if err != nil {
		return nil, fmt.Errorf("fetching %q: %w", word, err)
	}
	if resp.StatusCode != http.StatusOK {
		// The body is of no use on failure; a close error would add nothing.
		_ = resp.Body.Close()
		return nil, fmt.Errorf("word not found (HTTP %d)", resp.StatusCode)
	}
	return resp.Body, nil
}
43
44
func redirectBase(entry *Entry) string {
45
	if len(entry.POSBlocks) == 0 || len(entry.POSBlocks[0].Senses) == 0 {
46
		return ""
47
	}
48
	def := strings.Join(strings.Fields(entry.POSBlocks[0].Senses[0].Definition), " ")
49
	prefixes := []string{
50
		"past simple of ", "past tense of ", "past participle of ",
51
		"present participle of ", "third person singular of ", "3rd person singular of ",
52
	}
53
	for _, p := range prefixes {
54
		if i := strings.Index(strings.ToLower(def), p); i >= 0 {
55
			base := strings.TrimSpace(def[i+len(p):])
56
			if space := strings.IndexAny(base, " \t\n"); space > 0 {
57
				base = base[:space]
58
			}
59
			return strings.ToLower(base)
60
		}
61
	}
62
	return ""
63
}
64
65
// Lookup fetches and scrapes the entry for word.
//
// When the scraped entry merely points at a base form (see redirectBase), the
// base form is looked up instead. The returned entry then holds the base
// form's content, with Word set to the originally requested word and
// RedirectWord set to the base form the original entry pointed at.
//
// At most maxRedirects redirects are followed, so entries that point at each
// other cannot cause an endless chain of requests; once the limit is reached
// the most recently scraped entry is returned as is. Any fetch or scrape error
// aborts the lookup.
func (d *CambridgeDictionary) Lookup(word string) (*Entry, error) {
	const maxRedirects = 3

	target := word
	firstBase := "" // base form named by the entry for the requested word
	for hops := 0; ; hops++ {
		body, err := d.fetch(target)
		if err != nil {
			return nil, err
		}
		entry, err := d.Scrape(target, body)
		// Close before any further request rather than deferring, so the
		// response is not held open while a redirect is being followed.
		_ = body.Close()
		if err != nil {
			return nil, err
		}

		base := redirectBase(entry)
		// redirectBase always yields lower case, so compare case-insensitively
		// to avoid treating "Run" -> "run" as a redirect.
		if base == "" || strings.EqualFold(base, target) || hops >= maxRedirects {
			if firstBase != "" {
				entry.Word = word
				entry.RedirectWord = firstBase
			}
			return entry, nil
		}
		if firstBase == "" {
			firstBase = base
		}
		target = base
	}
}
86
87
// Scrape parses the HTML read from r and builds the Entry for word.
//
// It locates the div whose data-id attribute is "cald4", collects every
// div.entry-body__el beneath it and turns each one into a POSBlock. An error
// is returned if the HTML cannot be parsed, or with the message
// "word not found" if either the container or the entry bodies are missing.
func (d *CambridgeDictionary) Scrape(word string, r io.Reader) (*Entry, error) {
	root, err := html.Parse(r)
	if err != nil {
		return nil, err
	}

	container := findFirstDescendantWithAttr(root, "div", "data-id", "cald4")
	if container == nil {
		return nil, fmt.Errorf("word not found")
	}

	entryNodes := findAllDescendantsWithClass(container, "div", "entry-body__el")
	if len(entryNodes) == 0 {
		return nil, fmt.Errorf("word not found")
	}

	entry := &Entry{Word: word, POSBlocks: make([]POSBlock, len(entryNodes))}
	for i, node := range entryNodes {
		entry.POSBlocks[i] = parsePOSBlock(node)
	}
	return entry, nil
}
109
110
// hasClass reports whether any class attribute of n lists class as one of its
// whitespace-separated names.
func hasClass(n *html.Node, class string) bool {
	for _, attr := range n.Attr {
		if attr.Key != "class" {
			continue
		}
		names := strings.Fields(attr.Val)
		if slices.Contains(names, class) {
			return true
		}
	}
	return false
}
120
121
// textContent returns the concatenated data of every text node in the subtree
// rooted at n, in document order, with leading and trailing whitespace
// trimmed.
func textContent(n *html.Node) string {
	var pieces []string

	var collect func(node *html.Node)
	collect = func(node *html.Node) {
		if node.Type == html.TextNode {
			pieces = append(pieces, node.Data)
		}
		child := node.FirstChild
		for child != nil {
			collect(child)
			child = child.NextSibling
		}
	}
	collect(n)

	return strings.TrimSpace(strings.Join(pieces, ""))
}
135
136
// findFirstDescendantWithClass returns the first element, in document order,
// within the subtree rooted at n (n itself included) whose tag name is tag and
// which carries class. It returns nil when there is no such element.
func findFirstDescendantWithClass(n *html.Node, tag, class string) *html.Node {
	// Depth-first, pre-order walk driven by an explicit stack.
	pending := []*html.Node{n}
	for len(pending) > 0 {
		top := len(pending) - 1
		cur := pending[top]
		pending = pending[:top]

		if cur.Type == html.ElementNode && cur.Data == tag && hasClass(cur, class) {
			return cur
		}

		// Append the children, then reverse just that tail so the leftmost
		// child ends up on top of the stack and is visited next.
		mark := len(pending)
		for child := cur.FirstChild; child != nil; child = child.NextSibling {
			pending = append(pending, child)
		}
		slices.Reverse(pending[mark:])
	}
	return nil
}
147
148
// findAllDescendantsWithClass returns, in document order, every element in the
// subtree rooted at n (n itself included) whose tag name is tag and which
// carries class. The search does not descend into a matching element, so
// matches nested inside another match are not reported. The result is nil when
// nothing matches.
func findAllDescendantsWithClass(n *html.Node, tag, class string) []*html.Node {
	if n.Type == html.ElementNode && n.Data == tag && hasClass(n, class) {
		return []*html.Node{n}
	}

	var matches []*html.Node
	for child := n.FirstChild; child != nil; child = child.NextSibling {
		matches = append(matches, findAllDescendantsWithClass(child, tag, class)...)
	}
	return matches
}
163
164
// findFirstDescendantWithAttr returns the first element, in document order,
// within the subtree rooted at n (n itself included) whose tag name is tag and
// which has an attribute named key with the exact value val. It returns nil
// when there is no such element.
func findFirstDescendantWithAttr(n *html.Node, tag, key, val string) *html.Node {
	isTarget := n.Type == html.ElementNode && n.Data == tag &&
		slices.ContainsFunc(n.Attr, func(attr html.Attribute) bool {
			return attr.Key == key && attr.Val == val
		})
	if isTarget {
		return n
	}

	for child := n.FirstChild; child != nil; child = child.NextSibling {
		if hit := findFirstDescendantWithAttr(child, tag, key, val); hit != nil {
			return hit
		}
	}
	return nil
}
179
180
// parseIPA returns the text of the first span.ipa found inside the first span
// under ph that carries the class named by region (for example "us" or "uk").
// It returns "" when either span is missing.
func parseIPA(ph *html.Node, region string) string {
	if regionEl := findFirstDescendantWithClass(ph, "span", region); regionEl != nil {
		if ipaEl := findFirstDescendantWithClass(regionEl, "span", "ipa"); ipaEl != nil {
			return textContent(ipaEl)
		}
	}
	return ""
}
191
192
// parsePOSBlock converts one entry body element into a POSBlock.
//
// The part of speech and the IPA transcription are read from the first
// div.pos-header under body; the US transcription is preferred, with the UK
// one as fallback. Both stay empty when body has no pos-header. Each
// div.def-block under body yields one Sense: its definition is the text of the
// first div.def with trailing colons removed, and its examples are the
// span.eg texts taken from the first maxExamples div.examp elements.
func parsePOSBlock(body *html.Node) POSBlock {
	const maxExamples = 3

	var pos, ipa string
	// The header lookup returns nil when there is no pos-header, and the
	// search helpers dereference the node they are given, so only query the
	// header when it exists.
	if ph := findFirstDescendantWithClass(body, "div", "pos-header"); ph != nil {
		if el := findFirstDescendantWithClass(ph, "span", "pos"); el != nil {
			pos = textContent(el)
		}
		if ipa = parseIPA(ph, "us"); ipa == "" {
			ipa = parseIPA(ph, "uk")
		}
	}

	var senses []Sense
	for _, db := range findAllDescendantsWithClass(body, "div", "def-block") {
		def := ""
		if el := findFirstDescendantWithClass(db, "div", "def"); el != nil {
			def = strings.TrimRight(textContent(el), ":")
		}

		exampDivs := findAllDescendantsWithClass(db, "div", "examp")
		// The cap applies to the example blocks considered, not to the number
		// of examples collected: a block without a span.eg still counts.
		if len(exampDivs) > maxExamples {
			exampDivs = exampDivs[:maxExamples]
		}
		var exs []string
		for _, ed := range exampDivs {
			if eg := findFirstDescendantWithClass(ed, "span", "eg"); eg != nil {
				exs = append(exs, textContent(eg))
			}
		}
		senses = append(senses, Sense{Definition: def, Examples: exs})
	}

	return POSBlock{POS: pos, IPA: ipa, Senses: senses}
}