From 28503cc3bdc54565dcfe27b719078c788b9576ba Mon Sep 17 00:00:00 2001
From: tastytea
Date: Sat, 18 May 2019 01:47:10 +0200
Subject: [PATCH] Only attempt to extract title and description from HTML files.

---
 src/uri.cpp | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/src/uri.cpp b/src/uri.cpp
index c5444b6..ca5763e 100644
--- a/src/uri.cpp
+++ b/src/uri.cpp
@@ -82,16 +82,28 @@ const html_extract URI::get()
 
 const string URI::extract_title(const string &html)
 {
-    smatch match;
-    regex_search(html, match, regex("([^<]+)"));
-    return remove_newlines(match[1].str());
+    const regex re_htmlfile("\\.(.?html?|xml|rss)$");
+    if (_uri.substr(0, 4) == "http" || regex_search(_uri, re_htmlfile))
+    {
+        smatch match;
+        regex_search(html, match, regex("<title>([^<]+)"));
+        return remove_newlines(match[1].str());
+    }
+
+    return "";
 }
 
 const string URI::extract_description(const string &html)
 {
-    smatch match;
-    regex_search(html, match, regex("description\"[^>]+content=\"([^\"]+)"));
-    return remove_newlines(match[1].str());
+    const regex re_htmlfile("\\.(.?html?|xml|rss)$");
+    if (_uri.substr(0, 4) == "http" || regex_search(_uri, re_htmlfile))
+    {
+        smatch match;
+        regex_search(html, match, regex("description\"[^>]+content=\"([^\"]+)"));
+        return remove_newlines(match[1].str());
+    }
+
+    return "";
 }
 
 const string URI::strip_html(const string &html)