Don't inject page numbers and headline-markers into the text.
Some checks failed
continuous-integration/drone/push Build is failing
Some checks failed
continuous-integration/drone/push Build is failing
The metadata is recorded in position → data pairs. Closes: #13
This commit is contained in:
parent
ebb8b63830
commit
b8431019b7
158
src/search.cpp
158
src/search.cpp
|
@ -27,6 +27,8 @@
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <array>
|
#include <array>
|
||||||
|
#include <iterator>
|
||||||
|
#include <memory>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <string_view>
|
#include <string_view>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
@ -83,18 +85,18 @@ std::vector<match> search(const fs::path &filepath,
|
||||||
for (const auto &entry : epub_filepaths)
|
for (const auto &entry : epub_filepaths)
|
||||||
{
|
{
|
||||||
DEBUGLOG << "Processing " << entry;
|
DEBUGLOG << "Processing " << entry;
|
||||||
auto document{zip::read_file(filepath, entry)};
|
file_in_epub file;
|
||||||
if (!opts.raw)
|
|
||||||
{
|
{
|
||||||
cleanup_text(document);
|
const auto document{zip::read_file(filepath, entry)};
|
||||||
document = helpers::unescape_html(document);
|
if (!opts.raw)
|
||||||
|
{
|
||||||
|
file = cleanup_text(helpers::unescape_html(document));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
string::const_iterator begin{document.begin()};
|
string::const_iterator begin{file.text.begin()};
|
||||||
string::const_iterator end{document.end()};
|
string::const_iterator end{file.text.end()};
|
||||||
boost::match_results<string::const_iterator> match_result;
|
boost::match_results<string::const_iterator> match_result;
|
||||||
string last_headline;
|
|
||||||
string last_page;
|
|
||||||
|
|
||||||
while (boost::regex_search(begin, end, match_result, re,
|
while (boost::regex_search(begin, end, match_result, re,
|
||||||
boost::match_default))
|
boost::match_default))
|
||||||
|
@ -104,18 +106,10 @@ std::vector<match> search(const fs::path &filepath,
|
||||||
match.filepath_inside = entry;
|
match.filepath_inside = entry;
|
||||||
match.text = match_result[0];
|
match.text = match_result[0];
|
||||||
match.context = context(match_result, opts.context);
|
match.context = context(match_result, opts.context);
|
||||||
const auto current_headline{headline(match_result.prefix().str())};
|
const auto pos = static_cast<size_t>(
|
||||||
if (!current_headline.empty())
|
std::distance(begin, match_result[0].begin()));
|
||||||
{
|
match.headline = headline(file, pos);
|
||||||
last_headline = current_headline;
|
match.page = page(file, pos);
|
||||||
}
|
|
||||||
match.headline = last_headline;
|
|
||||||
const auto current_page{page(match_result.prefix().str())};
|
|
||||||
if (!current_page.empty())
|
|
||||||
{
|
|
||||||
last_page = current_page;
|
|
||||||
}
|
|
||||||
match.page = last_page;
|
|
||||||
|
|
||||||
matches.emplace_back(match);
|
matches.emplace_back(match);
|
||||||
begin = match_result[0].second;
|
begin = match_result[0].second;
|
||||||
|
@ -125,75 +119,84 @@ std::vector<match> search(const fs::path &filepath,
|
||||||
return matches;
|
return matches;
|
||||||
}
|
}
|
||||||
|
|
||||||
void cleanup_text(string &text)
|
file_in_epub cleanup_text(const std::string_view text)
|
||||||
{
|
{
|
||||||
|
string output{text};
|
||||||
static const boost::regex re_header_start{"<[hH][1-6]"};
|
static const boost::regex re_header_start{"<[hH][1-6]"};
|
||||||
static const boost::regex re_header_end{"</[hH][1-6]"};
|
static const boost::regex re_header_end{"</[hH][1-6]"};
|
||||||
static const boost::regex re_pagebreak{"[^>]+pagebreak[^>]+"
|
static const boost::regex re_pagebreak{"[^>]+pagebreak[^>]+"
|
||||||
"(title|aria-label)"
|
"(title|aria-label)"
|
||||||
"=\"([[:alnum:]]+)\""};
|
"=\"([[:alnum:]]+)\""};
|
||||||
|
|
||||||
size_t pos{};
|
// TODO: Make this more efficient, 3 → 1;
|
||||||
while ((pos = text.find('<', pos)) != string::npos)
|
size_t pos{0};
|
||||||
|
while ((pos = output.find('\r', pos)) != string::npos)
|
||||||
{
|
{
|
||||||
auto endpos{text.find('>', pos) + 1};
|
output.erase(pos, 1);
|
||||||
|
}
|
||||||
|
|
||||||
// Mark headlines. We need them later on.
|
pos = 0;
|
||||||
string replacement;
|
while ((pos = output.find_first_of("\n\t", pos)) != string::npos)
|
||||||
if (boost::regex_match(text.substr(pos, 3), re_header_start))
|
{
|
||||||
|
output.replace(pos, 1, " ");
|
||||||
|
}
|
||||||
|
|
||||||
|
pos = 0;
|
||||||
|
while ((pos = output.find(" ", pos)) != string::npos)
|
||||||
|
{
|
||||||
|
output.replace(pos, 2, " ");
|
||||||
|
}
|
||||||
|
|
||||||
|
pos = 0;
|
||||||
|
file_in_epub file;
|
||||||
|
size_t headline_start{string::npos};
|
||||||
|
while ((pos = output.find('<', pos)) != string::npos)
|
||||||
|
{
|
||||||
|
auto endpos{output.find('>', pos) + 1};
|
||||||
|
|
||||||
|
if (boost::regex_match(output.substr(pos, 3), re_header_start))
|
||||||
{
|
{
|
||||||
replacement = "<H>";
|
headline_start = pos;
|
||||||
}
|
}
|
||||||
else if (boost::regex_match(text.substr(pos, 4), re_header_end))
|
else if (boost::regex_match(output.substr(pos, 4), re_header_end))
|
||||||
{
|
{
|
||||||
replacement = "</H>";
|
if (headline_start != string::npos)
|
||||||
|
{
|
||||||
|
file.headlines.insert(
|
||||||
|
{headline_start,
|
||||||
|
output.substr(headline_start, pos - headline_start)});
|
||||||
|
headline_start = string::npos;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else if (text.substr(pos, 6) == "<span ")
|
else if (output.substr(pos, 6) == "<span ")
|
||||||
{
|
{
|
||||||
boost::match_results<string::const_iterator> match;
|
boost::match_results<string::const_iterator> match;
|
||||||
using it_size_t = string::const_iterator::difference_type;
|
using it_size_t = string::const_iterator::difference_type;
|
||||||
string::const_iterator begin{text.begin()
|
string::const_iterator begin{output.begin()
|
||||||
+ static_cast<it_size_t>(pos)};
|
+ static_cast<it_size_t>(pos)};
|
||||||
string::const_iterator end{text.begin()
|
string::const_iterator end{output.begin()
|
||||||
+ static_cast<it_size_t>(endpos)};
|
+ static_cast<it_size_t>(endpos)};
|
||||||
|
|
||||||
if (boost::regex_search(begin, end, match, re_pagebreak))
|
if (boost::regex_search(begin, end, match, re_pagebreak))
|
||||||
{
|
{
|
||||||
replacement = format("<PAGE {0:s}>", match[2].str());
|
file.pages.insert({pos, match[2].str()});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if (text.substr(pos, 7) == "<style "
|
else if (output.substr(pos, 7) == "<style "
|
||||||
|| text.substr(pos, 8) == "<script ")
|
|| output.substr(pos, 8) == "<script ")
|
||||||
{
|
{
|
||||||
if (text.find("/>", pos) > endpos)
|
if (output.find("/>", pos) > endpos)
|
||||||
{
|
{
|
||||||
endpos = text.find('>', endpos) + 1;
|
endpos = output.find('>', endpos) + 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
DEBUGLOG << "Replacing '" << text.substr(pos, endpos - pos)
|
output.erase(pos, endpos - pos);
|
||||||
<< "' with '" << replacement << "'.";
|
|
||||||
text.replace(pos, endpos - pos, replacement);
|
|
||||||
pos += replacement.length();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pos = 0;
|
file.text = output;
|
||||||
while ((pos = text.find('\r', pos)) != string::npos)
|
|
||||||
{
|
|
||||||
text.erase(pos, 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
pos = 0;
|
return file;
|
||||||
while ((pos = text.find_first_of("\n\t", pos)) != string::npos)
|
|
||||||
{
|
|
||||||
text.replace(pos, 1, " ");
|
|
||||||
}
|
|
||||||
|
|
||||||
pos = 0;
|
|
||||||
while ((pos = text.find(" ", pos)) != string::npos)
|
|
||||||
{
|
|
||||||
text.replace(pos, 2, " ");
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
match_context context(const boost::match_results<string::const_iterator> &match,
|
match_context context(const boost::match_results<string::const_iterator> &match,
|
||||||
|
@ -264,39 +267,36 @@ match_context context(const boost::match_results<string::const_iterator> &match,
|
||||||
return {before, after};
|
return {before, after};
|
||||||
}
|
}
|
||||||
|
|
||||||
string headline(const std::string_view prefix)
|
std::string headline(const file_in_epub &file, const size_t pos)
|
||||||
{
|
{
|
||||||
size_t pos{prefix.length()};
|
std::string_view last;
|
||||||
if ((pos = prefix.rfind("<H>", pos)) != std::string_view::npos)
|
|
||||||
|
for (const auto &pair : file.headlines)
|
||||||
{
|
{
|
||||||
pos += 3;
|
if (pair.first > pos)
|
||||||
string result{prefix.substr(pos, prefix.find('<', pos) - pos)};
|
|
||||||
|
|
||||||
while (helpers::is_whitespace(*result.begin()))
|
|
||||||
{
|
{
|
||||||
result.erase(0, 1);
|
break;
|
||||||
}
|
}
|
||||||
while (helpers::is_whitespace(*result.rbegin()))
|
last = pair.second;
|
||||||
{
|
|
||||||
result.erase(result.size() - 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
return result;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return {};
|
return last.data();
|
||||||
}
|
}
|
||||||
|
|
||||||
//! Return current page if possible.
//!
//! Returns the page recorded at the greatest position that is not after pos,
//! or an empty string if no page is recorded at or before pos.
string page(const file_in_epub &file, const size_t pos)
{
    // upper_bound() yields the first page marker recorded after pos, so the
    // entry right before it is the one in effect at pos.
    const auto next{file.pages.upper_bound(pos)};
    if (next == file.pages.begin())
    {
        // Nothing at or before pos. Returning an empty string explicitly
        // avoids constructing a string from a null pointer.
        return {};
    }

    return std::prev(next)->second;
}
|
||||||
|
|
||||||
} // namespace epubgrep::search
|
} // namespace epubgrep::search
|
||||||
|
|
|
@ -22,7 +22,9 @@
|
||||||
|
|
||||||
#include <boost/regex.hpp>
|
#include <boost/regex.hpp>
|
||||||
|
|
||||||
|
#include <cstddef>
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
|
#include <map>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <string_view>
|
#include <string_view>
|
||||||
#include <utility>
|
#include <utility>
|
||||||
|
@ -52,13 +54,20 @@ struct settings
|
||||||
std::uint64_t context{0};
|
std::uint64_t context{0};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
//! Text of a file inside an EPUB and the metadata that belongs to it.
//!
//! The metadata is recorded in position → data pairs.
struct file_in_epub
{
    //! The text that is searched.
    std::string text;
    //! Position in text → headline.
    std::map<std::size_t, std::string> headlines;
    //! Position in text → page.
    std::map<std::size_t, std::string> pages;
};
|
||||||
|
|
||||||
//! Search file, return matches.
|
//! Search file, return matches.
|
||||||
[[nodiscard]] std::vector<match> search(const fs::path &filepath,
|
[[nodiscard]] std::vector<match> search(const fs::path &filepath,
|
||||||
std::string_view regex,
|
std::string_view regex,
|
||||||
const settings &opts);
|
const settings &opts);
|
||||||
|
|
||||||
//! Strip HTML, remove newlines, condense spaces.
|
//! Strip HTML, remove newlines, condense spaces.
|
||||||
void cleanup_text(std::string &text);
|
[[nodiscard]] file_in_epub cleanup_text(std::string_view text);
|
||||||
|
|
||||||
//! Return words before and after the match.
|
//! Return words before and after the match.
|
||||||
[[nodiscard]] match_context
|
[[nodiscard]] match_context
|
||||||
|
@ -66,10 +75,10 @@ context(const boost::match_results<std::string::const_iterator> &match,
|
||||||
std::uint64_t words);
|
std::uint64_t words);
|
||||||
|
|
||||||
//! Return last headline if possible.
|
//! Return last headline if possible.
|
||||||
[[nodiscard]] std::string headline(std::string_view prefix);
|
[[nodiscard]] std::string headline(const file_in_epub &file, size_t pos);
|
||||||
|
|
||||||
//! Return current page if possible.
|
//! Return current page if possible.
|
||||||
[[nodiscard]] std::string page(std::string_view prefix);
|
[[nodiscard]] std::string page(const file_in_epub &file, size_t pos);
|
||||||
|
|
||||||
} // namespace epubgrep::search
|
} // namespace epubgrep::search
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user