diff --git a/internal/reader/icon/finder.go b/internal/reader/icon/finder.go index 2ff96031..b6b5ee7d 100644 --- a/internal/reader/icon/finder.go +++ b/internal/reader/icon/finder.go @@ -64,13 +64,17 @@ func (f *iconFinder) findIcon() (*model.Icon, error) { } } - if icon, err := f.fetchIconsFromHTMLDocument(); err != nil { - slog.Debug("Unable to fetch icons from HTML document", - slog.String("website_url", f.websiteURL), - slog.Any("error", err), - ) - } else if icon != nil { - return icon, nil + // Try the website URL first, then fall back to the root URL if no icon is found. + // The website URL may include a subdirectory (e.g., https://example.org/subfolder/), and icons can be referenced relative to that path. + for _, documentURL := range []string{f.websiteURL, urllib.RootURL(f.websiteURL)} { + if icon, err := f.fetchIconsFromHTMLDocument(documentURL); err != nil { + slog.Debug("Unable to fetch icons from HTML document", + slog.String("document_url", documentURL), + slog.Any("error", err), + ) + } else if icon != nil { + return icon, nil + } } return f.fetchDefaultIcon() @@ -94,14 +98,12 @@ func (f *iconFinder) fetchDefaultIcon() (*model.Icon, error) { return icon, nil } -func (f *iconFinder) fetchIconsFromHTMLDocument() (*model.Icon, error) { +func (f *iconFinder) fetchIconsFromHTMLDocument(documentURL string) (*model.Icon, error) { slog.Debug("Searching icons from HTML document", - slog.String("website_url", f.websiteURL), + slog.String("document_url", documentURL), ) - rootURL := urllib.RootURL(f.websiteURL) - - responseHandler := fetcher.NewResponseHandler(f.requestBuilder.ExecuteRequest(rootURL)) + responseHandler := fetcher.NewResponseHandler(f.requestBuilder.ExecuteRequest(documentURL)) defer responseHandler.Close() if localizedError := responseHandler.LocalizedError(); localizedError != nil { @@ -109,6 +111,7 @@ func (f *iconFinder) fetchIconsFromHTMLDocument() (*model.Icon, error) { } iconURLs, err := findIconURLsFromHTMLDocument( + documentURL, responseHandler.Body(config.Opts.HTTPClientMaxBodySize()), responseHandler.ContentType(), ) @@ -117,32 +120,27 @@ func (f *iconFinder) fetchIconsFromHTMLDocument() (*model.Icon, error) { } slog.Debug("Searched icon from HTML document", - slog.String("website_url", f.websiteURL), + slog.String("document_url", documentURL), slog.String("icon_urls", strings.Join(iconURLs, ",")), ) for _, iconURL := range iconURLs { if strings.HasPrefix(iconURL, "data:") { slog.Debug("Found icon with data URL", - slog.String("website_url", f.websiteURL), + slog.String("document_url", documentURL), ) return parseImageDataURL(iconURL) } - iconURL, err = urllib.AbsoluteURL(f.websiteURL, iconURL) - if err != nil { - return nil, fmt.Errorf(`icon: unable to convert icon URL to absolute URL: %w`, err) - } - if icon, err := f.downloadIcon(iconURL); err != nil { slog.Debug("Unable to download icon from HTML document", - slog.String("website_url", f.websiteURL), + slog.String("document_url", documentURL), slog.String("icon_url", iconURL), slog.Any("error", err), ) } else if icon != nil { slog.Debug("Downloaded icon from HTML document", - slog.String("website_url", f.websiteURL), + slog.String("document_url", documentURL), slog.String("icon_url", iconURL), ) return icon, nil @@ -195,7 +193,7 @@ func resizeIcon(icon *model.Icon) *model.Icon { } if !slices.Contains([]string{"image/jpeg", "image/png", "image/gif", "image/webp"}, icon.MimeType) { - slog.Info("Icon resize skipped: unsupported MIME type", slog.String("mime_type", icon.MimeType)) + slog.Debug("Icon resize skipped: unsupported MIME type", slog.String("mime_type", icon.MimeType)) return icon } @@ -244,7 +242,7 @@ func resizeIcon(icon *model.Icon) *model.Icon { return icon } -func findIconURLsFromHTMLDocument(body io.Reader, contentType string) ([]string, error) { +func findIconURLsFromHTMLDocument(documentURL string, body io.Reader, contentType string) ([]string, error) { htmlDocumentReader, err := encoding.NewCharsetReader(body, contentType) if err != nil { return nil, fmt.Errorf("icon: unable to create charset reader: %w", err) @@ -268,11 +266,20 @@ func findIconURLsFromHTMLDocument(body io.Reader, contentType string) ([]string, for _, s := range doc.Find(query).EachIter() { href, _ := s.Attr("href") - if iconURL := strings.TrimSpace(href); iconURL != "" { - iconURLs = append(iconURLs, iconURL) + href = strings.TrimSpace(href) + if href == "" { + continue + } + + if absoluteIconURL, err := urllib.AbsoluteURL(documentURL, href); err != nil { + slog.Warn("Unable to convert icon URL to absolute URL", slog.Any("error", err), slog.String("icon_href", href)) + } else { + iconURLs = append(iconURLs, absoluteIconURL) slog.Debug("Found icon URL in HTML document", slog.String("query", query), - slog.String("icon_url", iconURL)) + slog.String("icon_href", href), + slog.String("absolute_icon_url", absoluteIconURL), + ) } } } diff --git a/internal/reader/icon/finder_test.go b/internal/reader/icon/finder_test.go index 161c3b65..e441416e 100644 --- a/internal/reader/icon/finder_test.go +++ b/internal/reader/icon/finder_test.go @@ -123,16 +123,16 @@ func TestFindIconURLsFromHTMLDocument_MultipleIcons(t *testing.T) { ` - iconURLs, err := findIconURLsFromHTMLDocument(strings.NewReader(html), "text/html") + iconURLs, err := findIconURLsFromHTMLDocument("https://example.org", strings.NewReader(html), "text/html") if err != nil { t.Fatal(err) } expected := []string{ - "/favicon.ico", - "/shortcut-favicon.ico", - "/icon-shortcut.ico", - "/apple-touch-icon.png", + "https://example.org/favicon.ico", + "https://example.org/shortcut-favicon.ico", + "https://example.org/icon-shortcut.ico", + "https://example.org/apple-touch-icon.png", } if len(iconURLs) != len(expected) { @@ -155,22 +155,22 @@ func TestFindIconURLsFromHTMLDocument_CaseInsensitiveRel(t *testing.T) { - + ` - iconURLs, err := findIconURLsFromHTMLDocument(strings.NewReader(html), "text/html") + iconURLs, err := findIconURLsFromHTMLDocument("https://example.org/folder/", strings.NewReader(html), "text/html") if err != nil { t.Fatal(err) } expected := []string{ - "/favicon1.ico", - "/favicon2.ico", - "/favicon3.ico", - "/favicon4.ico", - "/favicon5.ico", - "/favicon6.ico", + "https://example.org/favicon1.ico", + "https://example.org/favicon2.ico", + "https://example.org/favicon3.ico", + "https://example.org/favicon4.ico", + "https://example.org/favicon5.ico", + "https://example.org/folder/favicon6.ico", } if len(iconURLs) != len(expected) { @@ -194,7 +194,7 @@ func TestFindIconURLsFromHTMLDocument_NoIcons(t *testing.T) { ` - iconURLs, err := findIconURLsFromHTMLDocument(strings.NewReader(html), "text/html") + iconURLs, err := findIconURLsFromHTMLDocument("https://example.org", strings.NewReader(html), "text/html") if err != nil { t.Fatal(err) } @@ -215,12 +215,12 @@ func TestFindIconURLsFromHTMLDocument_EmptyHref(t *testing.T) { ` - iconURLs, err := findIconURLsFromHTMLDocument(strings.NewReader(html), "text/html") + iconURLs, err := findIconURLsFromHTMLDocument("https://example.org", strings.NewReader(html), "text/html") if err != nil { t.Fatal(err) } - expected := []string{"/valid-icon.ico"} + expected := []string{"https://example.org/valid-icon.ico"} if len(iconURLs) != len(expected) { t.Fatalf("Expected %d icon URLs, got %d", len(expected), len(iconURLs)) @@ -241,7 +241,7 @@ func TestFindIconURLsFromHTMLDocument_DataURLs(t *testing.T) { ` - iconURLs, err := findIconURLsFromHTMLDocument(strings.NewReader(html), "text/html") + iconURLs, err := findIconURLsFromHTMLDocument("https://example.org/folder", strings.NewReader(html), "text/html") if err != nil { t.Fatal(err) } @@ -250,7 +250,7 @@ func TestFindIconURLsFromHTMLDocument_DataURLs(t *testing.T) { // So both rel="icon" links are found first, then the rel="shortcut icon" link expected := []string{ "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChAGAhGAQ+QAAAABJRU5ErkJggg==", - "/regular-icon.ico", + "https://example.org/regular-icon.ico", "data:image/svg+xml,", } @@ -277,17 +277,17 @@ func TestFindIconURLsFromHTMLDocument_RelativeAndAbsoluteURLs(t *testing.T) { ` - iconURLs, err := findIconURLsFromHTMLDocument(strings.NewReader(html), "text/html") + iconURLs, err := findIconURLsFromHTMLDocument("https://example.org/folder/", strings.NewReader(html), "text/html") if err != nil { t.Fatal(err) } expected := []string{ - "/absolute-path.ico", - "relative-path.ico", - "../parent-dir.ico", + "https://example.org/absolute-path.ico", + "https://example.org/folder/relative-path.ico", + "https://example.org/parent-dir.ico", "https://example.com/external.ico", - "//cdn.example.com/protocol-relative.ico", + "https://cdn.example.com/protocol-relative.ico", } if len(iconURLs) != len(expected) { @@ -311,7 +311,7 @@ func TestFindIconURLsFromHTMLDocument_InvalidHTML(t *testing.T) { ` - iconURLs, err := findIconURLsFromHTMLDocument(strings.NewReader(html), "text/html") + iconURLs, err := findIconURLsFromHTMLDocument("https://example.org", strings.NewReader(html), "text/html") if err != nil { t.Fatal(err) } @@ -324,7 +324,7 @@ func TestFindIconURLsFromHTMLDocument_InvalidHTML(t *testing.T) { // Should at least find the valid ones foundValidIcon := false for _, url := range iconURLs { - if url == "/valid-before-error.ico" || url == "/valid-after-error.ico" { + if url == "https://example.org/valid-before-error.ico" || url == "https://example.org/valid-after-error.ico" { foundValidIcon = true break } @@ -336,7 +336,7 @@ func TestFindIconURLsFromHTMLDocument_InvalidHTML(t *testing.T) { } func TestFindIconURLsFromHTMLDocument_EmptyDocument(t *testing.T) { - iconURLs, err := findIconURLsFromHTMLDocument(strings.NewReader(""), "text/html") + iconURLs, err := findIconURLsFromHTMLDocument("https://example.org", strings.NewReader(""), "text/html") if err != nil { t.Fatal(err) } diff --git a/internal/urllib/url.go b/internal/urllib/url.go index c7ad671b..e2bd8360 100644 --- a/internal/urllib/url.go +++ b/internal/urllib/url.go @@ -18,7 +18,7 @@ func IsAbsoluteURL(link string) bool { return u.IsAbs() } -// GetAbsoluteURL return the absolute form of `input` is possible, as well as its parser form. +// GetAbsoluteURL returns the absolute form of `input` if possible, as well as its parsed form. func GetAbsoluteURL(input string) (string, *url.URL, error) { if strings.HasPrefix(input, "//") { return "https:" + input, nil, nil