|
1
|
package main |
|
2
|
|
|
3
|
import (
	"fmt"
	"io"
	"net/http"
	"net/url"
	"slices"
	"strings"
	"time"

	"golang.org/x/net/html"
)
|
12
|
|
|
13
|
// cambridgeURL is the default URL prefix of the Cambridge English dictionary;
// the looked-up word is appended to it to form the page address.
const cambridgeURL = "https://dictionary.cambridge.org/dictionary/english/"

// userAgent is the default, browser-like value sent in the User-Agent header.
const userAgent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
17
|
|
|
18
|
// CambridgeDictionary looks words up by downloading a dictionary page over
// HTTP and scraping the entry out of its HTML.
type CambridgeDictionary struct {
	// baseURL is the URL prefix to which the lowercased word is appended.
	baseURL string
	// ua is the value sent in the User-Agent header of every request.
	ua string
}
|
22
|
|
|
23
|
func NewCambridgeDictionary() *CambridgeDictionary { |
|
24
|
return &CambridgeDictionary{baseURL: cambridgeURL, ua: userAgent} |
|
25
|
} |
|
26
|
|
|
27
|
func (d *CambridgeDictionary) fetch(word string) (io.ReadCloser, error) { |
|
28
|
req, err := http.NewRequest("GET", d.baseURL+strings.ToLower(word), nil) |
|
29
|
if err != nil { |
|
30
|
return nil, err |
|
31
|
} |
|
32
|
req.Header.Set("User-Agent", d.ua) |
|
33
|
resp, err := http.DefaultClient.Do(req) |
|
34
|
if err != nil { |
|
35
|
return nil, err |
|
36
|
} |
|
37
|
if resp.StatusCode != 200 { |
|
38
|
_ = resp.Body.Close() |
|
39
|
return nil, fmt.Errorf("word not found (HTTP %d)", resp.StatusCode) |
|
40
|
} |
|
41
|
return resp.Body, nil |
|
42
|
} |
|
43
|
|
|
44
|
func redirectBase(entry *Entry) string { |
|
45
|
if len(entry.POSBlocks) == 0 || len(entry.POSBlocks[0].Senses) == 0 { |
|
46
|
return "" |
|
47
|
} |
|
48
|
def := strings.Join(strings.Fields(entry.POSBlocks[0].Senses[0].Definition), " ") |
|
49
|
prefixes := []string{ |
|
50
|
"past simple of ", "past tense of ", "past participle of ", |
|
51
|
"present participle of ", "third person singular of ", "3rd person singular of ", |
|
52
|
} |
|
53
|
for _, p := range prefixes { |
|
54
|
if i := strings.Index(strings.ToLower(def), p); i >= 0 { |
|
55
|
base := strings.TrimSpace(def[i+len(p):]) |
|
56
|
if space := strings.IndexAny(base, " \t\n"); space > 0 { |
|
57
|
base = base[:space] |
|
58
|
} |
|
59
|
return strings.ToLower(base) |
|
60
|
} |
|
61
|
} |
|
62
|
return "" |
|
63
|
} |
|
64
|
|
|
65
|
func (d *CambridgeDictionary) Lookup(word string) (*Entry, error) { |
|
66
|
body, err := d.fetch(word) |
|
67
|
if err != nil { |
|
68
|
return nil, err |
|
69
|
} |
|
70
|
defer body.Close() //nolint:errcheck |
|
71
|
entry, err := d.Scrape(word, body) |
|
72
|
if err != nil { |
|
73
|
return nil, err |
|
74
|
} |
|
75
|
if base := redirectBase(entry); base != "" && base != word { |
|
76
|
redirected, err := d.Lookup(base) |
|
77
|
if err != nil { |
|
78
|
return nil, err |
|
79
|
} |
|
80
|
redirected.Word = word |
|
81
|
redirected.RedirectWord = base |
|
82
|
return redirected, nil |
|
83
|
} |
|
84
|
return entry, nil |
|
85
|
} |
|
86
|
|
|
87
|
func (d *CambridgeDictionary) Scrape(word string, r io.Reader) (*Entry, error) { |
|
88
|
doc, err := html.Parse(r) |
|
89
|
if err != nil { |
|
90
|
return nil, err |
|
91
|
} |
|
92
|
|
|
93
|
cald4 := findFirstDescendantWithAttr(doc, "div", "data-id", "cald4") |
|
94
|
if cald4 == nil { |
|
95
|
return nil, fmt.Errorf("word not found") |
|
96
|
} |
|
97
|
|
|
98
|
bodies := findAllDescendantsWithClass(cald4, "div", "entry-body__el") |
|
99
|
if len(bodies) == 0 { |
|
100
|
return nil, fmt.Errorf("word not found") |
|
101
|
} |
|
102
|
|
|
103
|
var blocks []POSBlock |
|
104
|
for _, body := range bodies { |
|
105
|
blocks = append(blocks, parsePOSBlock(body)) |
|
106
|
} |
|
107
|
return &Entry{Word: word, POSBlocks: blocks}, nil |
|
108
|
} |
|
109
|
|
|
110
|
func hasClass(n *html.Node, class string) bool { |
|
111
|
for _, a := range n.Attr { |
|
112
|
if a.Key == "class" { |
|
113
|
if slices.Contains(strings.Fields(a.Val), class) { |
|
114
|
return true |
|
115
|
} |
|
116
|
} |
|
117
|
} |
|
118
|
return false |
|
119
|
} |
|
120
|
|
|
121
|
func textContent(n *html.Node) string { |
|
122
|
var b strings.Builder |
|
123
|
var walk func(*html.Node) |
|
124
|
walk = func(n *html.Node) { |
|
125
|
if n.Type == html.TextNode { |
|
126
|
b.WriteString(n.Data) |
|
127
|
} |
|
128
|
for c := n.FirstChild; c != nil; c = c.NextSibling { |
|
129
|
walk(c) |
|
130
|
} |
|
131
|
} |
|
132
|
walk(n) |
|
133
|
return strings.TrimSpace(b.String()) |
|
134
|
} |
|
135
|
|
|
136
|
func findFirstDescendantWithClass(n *html.Node, tag, class string) *html.Node { |
|
137
|
if n.Type == html.ElementNode && n.Data == tag && hasClass(n, class) { |
|
138
|
return n |
|
139
|
} |
|
140
|
for c := n.FirstChild; c != nil; c = c.NextSibling { |
|
141
|
if found := findFirstDescendantWithClass(c, tag, class); found != nil { |
|
142
|
return found |
|
143
|
} |
|
144
|
} |
|
145
|
return nil |
|
146
|
} |
|
147
|
|
|
148
|
func findAllDescendantsWithClass(n *html.Node, tag, class string) []*html.Node { |
|
149
|
var res []*html.Node |
|
150
|
var walk func(*html.Node) |
|
151
|
walk = func(n *html.Node) { |
|
152
|
if n.Type == html.ElementNode && n.Data == tag && hasClass(n, class) { |
|
153
|
res = append(res, n) |
|
154
|
return |
|
155
|
} |
|
156
|
for c := n.FirstChild; c != nil; c = c.NextSibling { |
|
157
|
walk(c) |
|
158
|
} |
|
159
|
} |
|
160
|
walk(n) |
|
161
|
return res |
|
162
|
} |
|
163
|
|
|
164
|
func findFirstDescendantWithAttr(n *html.Node, tag, key, val string) *html.Node { |
|
165
|
if n.Type == html.ElementNode && n.Data == tag { |
|
166
|
for _, a := range n.Attr { |
|
167
|
if a.Key == key && a.Val == val { |
|
168
|
return n |
|
169
|
} |
|
170
|
} |
|
171
|
} |
|
172
|
for c := n.FirstChild; c != nil; c = c.NextSibling { |
|
173
|
if found := findFirstDescendantWithAttr(c, tag, key, val); found != nil { |
|
174
|
return found |
|
175
|
} |
|
176
|
} |
|
177
|
return nil |
|
178
|
} |
|
179
|
|
|
180
|
func parseIPA(ph *html.Node, region string) string { |
|
181
|
el := findFirstDescendantWithClass(ph, "span", region) |
|
182
|
if el == nil { |
|
183
|
return "" |
|
184
|
} |
|
185
|
ipaEl := findFirstDescendantWithClass(el, "span", "ipa") |
|
186
|
if ipaEl == nil { |
|
187
|
return "" |
|
188
|
} |
|
189
|
return textContent(ipaEl) |
|
190
|
} |
|
191
|
|
|
192
|
func parsePOSBlock(body *html.Node) POSBlock { |
|
193
|
ph := findFirstDescendantWithClass(body, "div", "pos-header") |
|
194
|
|
|
195
|
pos := "" |
|
196
|
if el := findFirstDescendantWithClass(ph, "span", "pos"); el != nil { |
|
197
|
pos = textContent(el) |
|
198
|
} |
|
199
|
|
|
200
|
ipa := parseIPA(ph, "us") |
|
201
|
if ipa == "" { |
|
202
|
ipa = parseIPA(ph, "uk") |
|
203
|
} |
|
204
|
|
|
205
|
defBlocks := findAllDescendantsWithClass(body, "div", "def-block") |
|
206
|
var senses []Sense |
|
207
|
for _, db := range defBlocks { |
|
208
|
def := "" |
|
209
|
if el := findFirstDescendantWithClass(db, "div", "def"); el != nil { |
|
210
|
def = strings.TrimRight(textContent(el), ":") |
|
211
|
} |
|
212
|
exampDivs := findAllDescendantsWithClass(db, "div", "examp") |
|
213
|
var exs []string |
|
214
|
for i, ed := range exampDivs { |
|
215
|
if i >= 3 { |
|
216
|
break |
|
217
|
} |
|
218
|
if eg := findFirstDescendantWithClass(ed, "span", "eg"); eg != nil { |
|
219
|
exs = append(exs, textContent(eg)) |
|
220
|
} |
|
221
|
} |
|
222
|
senses = append(senses, Sense{Definition: def, Examples: exs}) |
|
223
|
} |
|
224
|
|
|
225
|
return POSBlock{POS: pos, IPA: ipa, Senses: senses} |
|
226
|
} |