Wrap headlines in <H> and </H> during cleanup.

This commit is contained in:
tastytea 2021-05-24 18:08:40 +02:00
parent 8ab7d0f655
commit bb4a4c719f
Signed by: tastytea
GPG Key ID: CFC39497F1B26E07
2 changed files with 17 additions and 16 deletions

View File

@ -72,8 +72,7 @@ epubgrep does not operate on lines, but on whole files. This means you can
search for text spanning multiple lines. All newlines will be replaced by
spaces.
HTML will be stripped (except headlines) and newlines will be removed unless
*--raw* is specified.
HTML will be stripped and newlines will be removed unless *--raw* is specified.
=== Configuration

View File

@ -20,6 +20,7 @@
#include "zip.hpp"
#include <boost/regex.hpp>
#include <boost/regex/v4/regex_match.hpp>
#include <algorithm>
#include <string>
@ -99,18 +100,24 @@ void cleanup_text(std::string &text)
{
for (size_t pos{}; pos != std::string::npos; pos = text.find('<', pos))
{
// Don't strip headlines. We need them later on.
if (text[pos + 1] == 'h' || text.substr(pos + 1, 2) == "/h")
// Mark headlines. We need them later on.
std::string replacement;
if (boost::regex_match(text.substr(pos, 3), boost::regex{"<[hH][1-6]"}))
{
++pos;
continue;
replacement = "<H>";
}
text.erase(pos, text.find('>', pos) + 1 - pos);
else if (boost::regex_match(text.substr(pos, 3),
boost::regex{"</[hH]"}))
{
replacement = "</H>";
}
text.replace(pos, text.find('>', pos) + 1 - pos, replacement);
pos += replacement.length();
}
for (size_t pos{}; pos != std::string::npos; pos = text.find('\r', pos))
{
text.replace(pos, 1, "");
text.erase(pos, 1);
}
for (size_t pos{}; pos != std::string::npos; pos = text.find('\n', pos))
@ -154,15 +161,10 @@ context(const boost::match_results<std::string::const_iterator> &match,
[[nodiscard]] std::string headline(const std::string_view prefix)
{
size_t pos{prefix.length()};
while ((pos = prefix.rfind("<h", pos)) != std::string_view::npos)
while ((pos = prefix.rfind("<H>", pos)) != std::string_view::npos)
{
if (boost::regex_match(std::string{prefix.substr(pos, 4)},
boost::regex{"<h[1-6][> ]"}))
{
pos = prefix.find('>', pos) + 1;
return std::string{prefix.substr(pos, prefix.find('<', pos) - pos)};
}
pos -= 2;
pos += 3;
return std::string{prefix.substr(pos, prefix.find('<', pos) - pos)};
}
return {};