Don't inject page numbers and headline-markers into the text.
Some checks failed
continuous-integration/drone/push Build is failing

Instead, the metadata is recorded as position → data pairs.

Closes: #13
This commit is contained in:
tastytea 2021-06-06 21:26:09 +02:00
parent ebb8b63830
commit b8431019b7
Signed by: tastytea
GPG Key ID: CFC39497F1B26E07
2 changed files with 91 additions and 82 deletions

View File

@ -27,6 +27,8 @@
#include <algorithm> #include <algorithm>
#include <array> #include <array>
#include <iterator>
#include <memory>
#include <string> #include <string>
#include <string_view> #include <string_view>
#include <vector> #include <vector>
@ -83,18 +85,18 @@ std::vector<match> search(const fs::path &filepath,
for (const auto &entry : epub_filepaths) for (const auto &entry : epub_filepaths)
{ {
DEBUGLOG << "Processing " << entry; DEBUGLOG << "Processing " << entry;
auto document{zip::read_file(filepath, entry)}; file_in_epub file;
if (!opts.raw)
{ {
cleanup_text(document); const auto document{zip::read_file(filepath, entry)};
document = helpers::unescape_html(document); if (!opts.raw)
{
file = cleanup_text(helpers::unescape_html(document));
}
} }
string::const_iterator begin{document.begin()}; string::const_iterator begin{file.text.begin()};
string::const_iterator end{document.end()}; string::const_iterator end{file.text.end()};
boost::match_results<string::const_iterator> match_result; boost::match_results<string::const_iterator> match_result;
string last_headline;
string last_page;
while (boost::regex_search(begin, end, match_result, re, while (boost::regex_search(begin, end, match_result, re,
boost::match_default)) boost::match_default))
@ -104,18 +106,10 @@ std::vector<match> search(const fs::path &filepath,
match.filepath_inside = entry; match.filepath_inside = entry;
match.text = match_result[0]; match.text = match_result[0];
match.context = context(match_result, opts.context); match.context = context(match_result, opts.context);
const auto current_headline{headline(match_result.prefix().str())}; const auto pos = static_cast<size_t>(
if (!current_headline.empty()) std::distance(begin, match_result[0].begin()));
{ match.headline = headline(file, pos);
last_headline = current_headline; match.page = page(file, pos);
}
match.headline = last_headline;
const auto current_page{page(match_result.prefix().str())};
if (!current_page.empty())
{
last_page = current_page;
}
match.page = last_page;
matches.emplace_back(match); matches.emplace_back(match);
begin = match_result[0].second; begin = match_result[0].second;
@ -125,75 +119,84 @@ std::vector<match> search(const fs::path &filepath,
return matches; return matches;
} }
// NOTE(review): the rows below come from a side-by-side diff. Where a row
// holds two statements, the first belongs to the removed version of
// cleanup_text() and the second to the added version.
//
// Removed version: edits `text` in place. Header opening and closing tags are
// replaced by the markers "<H>" and "</H>", page-break spans by
// "<PAGE label>", and every other tag by nothing. Afterwards '\r' is deleted,
// '\n' and '\t' become spaces and runs of spaces are condensed.
//
// Added version: works on a copy (`output`), normalises the whitespace first,
// then erases every tag. Instead of writing markers into the text it stores
// headlines and page labels in a file_in_epub, keyed by the offset in
// `output` at which the tag was found, and returns that struct with
// `text` set to the stripped output.
//
// In both versions an opening "<style " or "<script " tag that has no "/>"
// before its own '>' extends the removed range through the next '>', so the
// element's content is dropped together with its tags (this assumes the
// content itself contains no '>').
void cleanup_text(string &text) file_in_epub cleanup_text(const std::string_view text)
{ {
string output{text};
static const boost::regex re_header_start{"<[hH][1-6]"}; static const boost::regex re_header_start{"<[hH][1-6]"};
static const boost::regex re_header_end{"</[hH][1-6]"}; static const boost::regex re_header_end{"</[hH][1-6]"};
static const boost::regex re_pagebreak{"[^>]+pagebreak[^>]+" static const boost::regex re_pagebreak{"[^>]+pagebreak[^>]+"
"(title|aria-label)" "(title|aria-label)"
"=\"([[:alnum:]]+)\""}; "=\"([[:alnum:]]+)\""};
size_t pos{}; // TODO: Make this more efficient, 3 → 1;
while ((pos = text.find('<', pos)) != string::npos) size_t pos{0};
while ((pos = output.find('\r', pos)) != string::npos)
{ {
auto endpos{text.find('>', pos) + 1}; output.erase(pos, 1);
}
// Mark headlines. We need them later on. pos = 0;
string replacement; while ((pos = output.find_first_of("\n\t", pos)) != string::npos)
if (boost::regex_match(text.substr(pos, 3), re_header_start)) {
output.replace(pos, 1, " ");
}
pos = 0;
// NOTE(review): as rendered here the needle is a single space while two
// characters are replaced, which would never terminate. Presumably the needle
// is two spaces and the page collapsed the whitespace — confirm against the
// repository.
while ((pos = output.find(" ", pos)) != string::npos)
{
output.replace(pos, 2, " ");
}
pos = 0;
file_in_epub file;
size_t headline_start{string::npos};
while ((pos = output.find('<', pos)) != string::npos)
{
// NOTE(review): if no '>' follows, find() returns npos and the + 1 wraps
// endpos to 0. The final erase then removes everything from pos to the end of
// output, and the "<span " branch would pass regex_search an end iterator
// that precedes begin.
auto endpos{output.find('>', pos) + 1};
if (boost::regex_match(output.substr(pos, 3), re_header_start))
{ {
replacement = "<H>"; headline_start = pos;
} }
else if (boost::regex_match(text.substr(pos, 4), re_header_end)) else if (boost::regex_match(output.substr(pos, 4), re_header_end))
{ {
replacement = "</H>"; if (headline_start != string::npos)
{
file.headlines.insert(
{headline_start,
output.substr(headline_start, pos - headline_start)});
headline_start = string::npos;
}
} }
else if (text.substr(pos, 6) == "<span ") else if (output.substr(pos, 6) == "<span ")
{ {
boost::match_results<string::const_iterator> match; boost::match_results<string::const_iterator> match;
using it_size_t = string::const_iterator::difference_type; using it_size_t = string::const_iterator::difference_type;
string::const_iterator begin{text.begin() string::const_iterator begin{output.begin()
+ static_cast<it_size_t>(pos)}; + static_cast<it_size_t>(pos)};
string::const_iterator end{text.begin() string::const_iterator end{output.begin()
+ static_cast<it_size_t>(endpos)}; + static_cast<it_size_t>(endpos)};
if (boost::regex_search(begin, end, match, re_pagebreak)) if (boost::regex_search(begin, end, match, re_pagebreak))
{ {
replacement = format("<PAGE {0:s}>", match[2].str()); file.pages.insert({pos, match[2].str()});
} }
} }
else if (text.substr(pos, 7) == "<style " else if (output.substr(pos, 7) == "<style "
|| text.substr(pos, 8) == "<script ") || output.substr(pos, 8) == "<script ")
{ {
if (text.find("/>", pos) > endpos) if (output.find("/>", pos) > endpos)
{ {
endpos = text.find('>', endpos) + 1; endpos = output.find('>', endpos) + 1;
} }
} }
DEBUGLOG << "Replacing '" << text.substr(pos, endpos - pos) output.erase(pos, endpos - pos);
<< "' with '" << replacement << "'.";
text.replace(pos, endpos - pos, replacement);
pos += replacement.length();
} }
pos = 0; file.text = output;
while ((pos = text.find('\r', pos)) != string::npos)
{
text.erase(pos, 1);
}
pos = 0; return file;
while ((pos = text.find_first_of("\n\t", pos)) != string::npos)
{
text.replace(pos, 1, " ");
}
pos = 0;
while ((pos = text.find(" ", pos)) != string::npos)
{
text.replace(pos, 2, " ");
}
} }
match_context context(const boost::match_results<string::const_iterator> &match, match_context context(const boost::match_results<string::const_iterator> &match,
@ -264,39 +267,36 @@ match_context context(const boost::match_results<string::const_iterator> &match,
return {before, after}; return {before, after};
} }
// NOTE(review): side-by-side diff rows. Where a row holds two statements, the
// first belongs to the removed version of headline() and the second to the
// added version.
//
// Removed version: looks for the last "<H>" marker in `prefix` and returns
// the text between it and the next '<', after stripping leading and trailing
// characters for which helpers::is_whitespace() is true. Returns an empty
// string when there is no marker.
//
// Added version: walks file.headlines (a std::map, so in ascending position
// order) and returns the text of the last entry whose position is not
// greater than `pos`.
string headline(const std::string_view prefix) std::string headline(const file_in_epub &file, const size_t pos)
{ {
size_t pos{prefix.length()}; std::string_view last;
if ((pos = prefix.rfind("<H>", pos)) != std::string_view::npos)
for (const auto &pair : file.headlines)
{ {
pos += 3; if (pair.first > pos)
string result{prefix.substr(pos, prefix.find('<', pos) - pos)};
while (helpers::is_whitespace(*result.begin()))
{ {
result.erase(0, 1); break;
} }
while (helpers::is_whitespace(*result.rbegin())) last = pair.second;
{
result.erase(result.size() - 1);
}
return result;
} }
// NOTE(review): in the added version `last` is still a default-constructed
// std::string_view when no headline lies at or before pos. Its data() is then
// a null pointer, and building the std::string return value from a null
// const char* is undefined behaviour. Returning std::string{last} would avoid
// that.
return {}; return last.data();
} }
// NOTE(review): side-by-side diff rows. Where a row holds two statements, the
// first belongs to the removed version of page() and the second to the added
// version.
//
// Removed version: looks for the last "<PAGE " marker in `prefix` and returns
// the text between it and the next '>'. Returns an empty string when there is
// no marker.
//
// Added version: walks file.pages (a std::map, so in ascending position
// order) and returns the label of the last entry whose position is not
// greater than `pos`.
string page(const std::string_view prefix) string page(const file_in_epub &file, const size_t pos)
{ {
size_t pos{prefix.length()}; std::string_view last;
while ((pos = prefix.rfind("<PAGE ", pos)) != std::string_view::npos)
for (const auto &pair : file.pages)
{ {
pos += 6; if (pair.first > pos)
return string{prefix.substr(pos, prefix.find('>', pos) - pos)}; {
break;
}
last = pair.second;
} }
// NOTE(review): in the added version `last` is still a default-constructed
// std::string_view when no page entry lies at or before pos. Its data() is
// then a null pointer, and building the std::string return value from a null
// const char* is undefined behaviour. Returning std::string{last} would avoid
// that.
return {}; return last.data();
} }
} // namespace epubgrep::search } // namespace epubgrep::search

View File

@ -22,7 +22,9 @@
#include <boost/regex.hpp> #include <boost/regex.hpp>
#include <cstddef>
#include <cstdint> #include <cstdint>
#include <map>
#include <string> #include <string>
#include <string_view> #include <string_view>
#include <utility> #include <utility>
@ -52,13 +54,20 @@ struct settings
std::uint64_t context{0}; std::uint64_t context{0};
}; };
//! Result of cleanup_text(): the stripped text of one file plus the metadata
//! that was found in its markup, keyed by position.
struct file_in_epub
{
//! The text after whitespace normalisation and removal of all tags.
std::string text;
//! Offset at which a header opening tag was found → the text between that
//! tag and the matching closing header tag.
std::map<size_t, std::string> headlines;
//! Offset at which a page-break `<span ` tag was found → the value captured
//! from its title or aria-label attribute.
std::map<size_t, std::string> pages;
};
//! Search file, return matches. //! Search file, return matches.
[[nodiscard]] std::vector<match> search(const fs::path &filepath, [[nodiscard]] std::vector<match> search(const fs::path &filepath,
std::string_view regex, std::string_view regex,
const settings &opts); const settings &opts);
//! Strip HTML, remove newlines, condense spaces. The added overload leaves its
//! argument untouched and returns the cleaned text together with the headlines
//! and page labels it found, each keyed by position.
void cleanup_text(std::string &text); [[nodiscard]] file_in_epub cleanup_text(std::string_view text);
//! Return words before and after the match. //! Return words before and after the match.
[[nodiscard]] match_context [[nodiscard]] match_context
@ -66,10 +75,10 @@ context(const boost::match_results<std::string::const_iterator> &match,
std::uint64_t words); std::uint64_t words);
//! Return last headline if possible. The added overload takes it from
//! file.headlines: the entry with the greatest position that is not after pos.
[[nodiscard]] std::string headline(std::string_view prefix); [[nodiscard]] std::string headline(const file_in_epub &file, size_t pos);
//! Return current page if possible. The added overload takes it from
//! file.pages: the entry with the greatest position that is not after pos.
[[nodiscard]] std::string page(std::string_view prefix); [[nodiscard]] std::string page(const file_in_epub &file, size_t pos);
} // namespace epubgrep::search } // namespace epubgrep::search