Fix pagebreak-regex and range in which pagebreaks are searched.

This commit is contained in:
tastytea 2021-06-05 15:20:40 +02:00
parent f1a0015f28
commit bdf9a86651
Signed by: tastytea
GPG Key ID: CFC39497F1B26E07
1 changed file with 5 additions and 4 deletions

View File

@@ -129,8 +129,9 @@ void cleanup_text(string &text)
{
static const boost::regex re_header_start{"<[hH][1-6]"};
static const boost::regex re_header_end{"</[hH][1-6]"};
static const boost::regex re_pagebreak{".+pagebreak.+(title|aria-label)"
"=\"([[:alnum:]]+)\".*"};
static const boost::regex re_pagebreak{"[^>]+pagebreak[^>]+"
"(title|aria-label)"
"=\"([[:alnum:]]+)\""};
size_t pos{};
while ((pos = text.find('<', pos)) != string::npos)
@@ -149,8 +150,8 @@ void cleanup_text(string &text)
{
auto endpos{text.find('>')};
boost::match_results<const char *> match;
if (boost::regex_search(text.substr(pos, endpos).data(), match,
re_pagebreak))
if (boost::regex_search(text.substr(pos, endpos - pos).data(),
match, re_pagebreak))
{
// FIXME: -fsanitize=address is complaining about this. ↓ 🤷
// Could not reproduce it.