Wrap headlines in <H> and </H> during cleanup.
This commit is contained in:
parent
8ab7d0f655
commit
bb4a4c719f
@ -72,8 +72,7 @@ epubgrep does not operate on lines, but on whole files. This means you can
|
||||
search for text spanning multiple lines. All newlines will be replaced by
|
||||
spaces.
|
||||
|
||||
HTML will be stripped (except headlines) and newlines will be removed unless
|
||||
*--raw* is specified.
|
||||
HTML will be stripped and newlines will be removed unless *--raw* is specified.
|
||||
|
||||
=== Configuration
|
||||
|
||||
|
@ -20,6 +20,7 @@
|
||||
#include "zip.hpp"
|
||||
|
||||
#include <boost/regex.hpp>
|
||||
#include <boost/regex/v4/regex_match.hpp>
|
||||
|
||||
#include <algorithm>
|
||||
#include <string>
|
||||
@ -99,18 +100,24 @@ void cleanup_text(std::string &text)
|
||||
{
|
||||
for (size_t pos{}; pos != std::string::npos; pos = text.find('<', pos))
|
||||
{
|
||||
// Don't strip headlines. We need them later on.
|
||||
if (text[pos + 1] == 'h' || text.substr(pos + 1, 2) == "/h")
|
||||
// Mark headlines. We need them later on.
|
||||
std::string replacement;
|
||||
if (boost::regex_match(text.substr(pos, 3), boost::regex{"<[hH][1-6]"}))
|
||||
{
|
||||
++pos;
|
||||
continue;
|
||||
replacement = "<H>";
|
||||
}
|
||||
text.erase(pos, text.find('>', pos) + 1 - pos);
|
||||
else if (boost::regex_match(text.substr(pos, 3),
|
||||
boost::regex{"</[hH]"}))
|
||||
{
|
||||
replacement = "</H>";
|
||||
}
|
||||
text.replace(pos, text.find('>', pos) + 1 - pos, replacement);
|
||||
pos += replacement.length();
|
||||
}
|
||||
|
||||
for (size_t pos{}; pos != std::string::npos; pos = text.find('\r', pos))
|
||||
{
|
||||
text.replace(pos, 1, "");
|
||||
text.erase(pos, 1);
|
||||
}
|
||||
|
||||
for (size_t pos{}; pos != std::string::npos; pos = text.find('\n', pos))
|
||||
@ -154,15 +161,10 @@ context(const boost::match_results<std::string::const_iterator> &match,
|
||||
[[nodiscard]] std::string headline(const std::string_view prefix)
|
||||
{
|
||||
size_t pos{prefix.length()};
|
||||
while ((pos = prefix.rfind("<h", pos)) != std::string_view::npos)
|
||||
while ((pos = prefix.rfind("<H>", pos)) != std::string_view::npos)
|
||||
{
|
||||
if (boost::regex_match(std::string{prefix.substr(pos, 4)},
|
||||
boost::regex{"<h[1-6][> ]"}))
|
||||
{
|
||||
pos = prefix.find('>', pos) + 1;
|
||||
return std::string{prefix.substr(pos, prefix.find('<', pos) - pos)};
|
||||
}
|
||||
pos -= 2;
|
||||
pos += 3;
|
||||
return std::string{prefix.substr(pos, prefix.find('<', pos) - pos)};
|
||||
}
|
||||
|
||||
return {};
|
||||
|
Loading…
x
Reference in New Issue
Block a user