Only search files in spine, in the right order.

The spine lists all content documents in their linear reading order. So we're
finally getting our results in the right order! 🎉

Since we skip the images and fonts, which usually make up the most bytes in an
EPUB file, the performance increase is immense. I measured 60-70% in a very
short test.

Closes: #1
This commit is contained in:
tastytea 2021-05-29 15:50:03 +02:00
parent c94d9de0db
commit 00e3edb9f2
Signed by: tastytea
GPG Key ID: CFC39497F1B26E07
9 changed files with 147 additions and 24 deletions

View File

@ -26,7 +26,7 @@ steps:
- alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get'
- apt-get update -q
- apt-get install -qq build-essential cmake clang locales
- apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc
- apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc libpugixml-dev
- rm -rf build && mkdir -p build && cd build
- cmake -G "Unix Makefiles" -DWITH_TESTS=YES ..
- make VERBOSE=1
@ -63,7 +63,7 @@ steps:
- alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get'
- apt-get update -q
- apt-get install -qq g++-8 build-essential clang locales
- apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc
- apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc libpugixml-dev
- sh cmake_installer.sh --skip-license --exclude-subdir --prefix=/usr/local
- rm -rf build && mkdir -p build && cd build
- cmake -G "Unix Makefiles" -DWITH_TESTS=YES ..
@ -120,7 +120,7 @@ steps:
- alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get'
- apt-get update -q
- apt-get install -qq build-essential cmake clang locales lsb-release
- apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc
- apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc libpugixml-dev
- rm -rf build && mkdir -p build && cd build
- cmake -G "Unix Makefiles" -DCMAKE_INSTALL_PREFIX=/usr ..
- make VERBOSE=1
@ -144,7 +144,7 @@ steps:
- alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get'
- apt-get update -q
- apt-get install -qq build-essential cmake clang locales lsb-release
- apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc
- apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc libpugixml-dev
- rm -rf build && mkdir -p build && cd build
- cmake -G "Unix Makefiles" -DCMAKE_INSTALL_PREFIX=/usr ..
- make VERBOSE=1
@ -176,7 +176,7 @@ steps:
- alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get'
- apt-get update -q
- apt-get install -qq g++-8 build-essential clang locales lsb-release
- apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc
- apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc libpugixml-dev
- sh cmake_installer.sh --skip-license --exclude-subdir --prefix=/usr/local
- rm -rf build && mkdir -p build && cd build
- cmake -G "Unix Makefiles" -DCMAKE_INSTALL_PREFIX=/usr ..

View File

@ -39,6 +39,7 @@ if(NOT termcolor_FOUND)
endif()
endif()
find_package(Threads REQUIRED)
find_package(pugixml 1 REQUIRED CONFIG)
add_subdirectory(src)

View File

@ -15,6 +15,7 @@
:uri-fmt: https://github.com/fmtlib/fmt
:uri-asciidoc: http://asciidoc.org/
:uri-termcolor: https://termcolor.readthedocs.io/
:uri-pugixml: https://pugixml.org/
:license: https://schlomp.space/tastytea/{project}/src/branch/main/LICENSE
:license-termcolor: https://schlomp.space/tastytea/{project}/src/branch/main/dist/termcolor/LICENSE
@ -84,6 +85,7 @@ If you get the error message that `add-apt-repository` was not found, install
* link:{uri-asciidoc}[AsciiDoc] (tested: 9.0 / 8.6)
* link:{uri-termcolor}[Termcolor] (tested: 2.0) (If not found, the bundled
version is used.)
* link:{uri-pugixml}[pugixml] (tested: 1.11 / 1.8)
* Optional
** Tests: link:{uri-catch}[Catch] (tested: 2.13 / 1.10)
@ -95,7 +97,9 @@ of CMake.
[source,shell]
--------------------------------------------------------------------------------
sudo apt install build-essential cmake libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc
sudo apt install build-essential cmake libboost-program-options-dev \
libboost-locale-dev libboost-regex-dev gettext libarchive-dev \
libfmt-dev asciidoc libpugixml-dev
--------------------------------------------------------------------------------
==== Get sourcecode

View File

@ -2,7 +2,7 @@
:doctype: manpage
:Author: tastytea
:Email: tastytea@tastytea.de
:Date: 2021-05-28
:Date: 2021-05-29
:Revision: 0.0.0
:man source: epubgrep
:man manual: General Commands Manual
@ -50,7 +50,8 @@ Ignore case distinctions in pattern and data.
Use additional _PATTERN_ for matching. Can be used more than once.
*-a*, *--raw*::
Do not clean up text before searching. No HTML stripping, no newline removal.
Do not clean up text before searching. No HTML stripping, no newline removal,
all files will be read (not just the text documents listed in the spine).
*-C* _NUMBER_, *context* _NUMBER_::
Print _NUMBER_ words of context around matches.

View File

@ -24,7 +24,8 @@ target_link_libraries(${PROJECT_NAME}_lib
fmt::fmt
termcolor::termcolor
Threads::Threads
m)
m
pugixml)
if(${CMAKE_VERSION} VERSION_LESS 3.17)
target_link_libraries(${PROJECT_NAME}_lib

View File

@ -62,7 +62,16 @@ std::vector<match> search(const fs::path &filepath,
const boost::regex re(regex.data(), flags);
std::vector<match> matches;
for (const auto &entry : zip::list(filepath))
std::vector<string> epub_filepaths{[&opts, &filepath]
{
if (!opts.raw)
{
return zip::list_spine(filepath);
}
return zip::list(filepath);
}()};
for (const auto &entry : epub_filepaths)
{
auto document{zip::read_file(filepath, entry)};
if (!opts.raw)

View File

@ -23,6 +23,7 @@
#include <boost/locale/message.hpp>
#include <fmt/format.h>
#include <fmt/ostream.h> // For compatibility with fmt 4.
#include <pugixml.hpp>
#include <cstdlib>
#include <cstring>
@ -30,6 +31,7 @@
#include <iostream>
#include <stdexcept>
#include <string>
#include <string_view>
#include <vector>
namespace epubgrep::zip
@ -50,12 +52,12 @@ std::vector<std::string> list(const fs::path &filepath)
if (in_epub_filepath == nullptr)
{ // If the encoding is broken, we skip the file.
std::cerr << translate("WARNING: ")
<< format(translate("{0:s} is damaged. "
"Skipping rest of file.\n")
<< format(translate("File in {0:s} is damaged. "
"Skipping in-EPUB file.\n")
.str()
.data(),
filepath);
break;
continue;
}
toc.emplace_back(in_epub_filepath);
archive_read_data_skip(zipfile);
@ -74,6 +76,16 @@ std::string read_file(const fs::path &filepath, std::string_view entry_path)
while (archive_read_next_header(zipfile, &entry) == ARCHIVE_OK)
{
const auto *path{archive_entry_pathname_utf8(entry)};
if (path == nullptr)
{ // If the encoding is broken, we skip the file.
std::cerr << translate("WARNING: ")
<< format(translate("File in {0:s} is damaged. "
"Skipping in-EPUB file.\n")
.str()
.data(),
filepath);
continue;
}
if (std::strcmp(path, entry_path.data()) == 0)
{
const auto length{static_cast<size_t>(archive_entry_size(entry))};
@ -140,4 +152,85 @@ void close_file(struct archive *zipfile, const fs::path &filepath)
}
}
std::vector<std::string> list_spine(const fs::path &filepath)
{
const fs::path opf_file_path{
[&filepath]
{
pugi::xml_document xml;
const std::string container{
read_file(filepath, "META-INF/container.xml")};
const auto result{xml.load_buffer(&container[0], container.size())};
if (result)
{
return xml.child("container")
.child("rootfiles")
.first_child()
.attribute("full-path")
.value();
}
return "";
}()};
std::vector<std::string> spine_filepaths;
if (!opf_file_path.empty())
{
pugi::xml_document xml;
const std::string opf_file{read_file(filepath, opf_file_path.string())};
const auto result{xml.load_buffer(&opf_file[0], opf_file.size())};
if (result)
{
auto manifest{xml.child("package").child("manifest")};
auto spine{xml.child("package").child("spine")};
for (const auto &itemref : spine)
{
const auto &idref{itemref.attribute("idref").value()};
const auto &item{manifest.find_child_by_attribute("id", idref)};
const std::string href{
urldecode(item.attribute("href").value())};
if (href[0] != '/')
{
spine_filepaths.emplace_back(
opf_file_path.parent_path() /= href);
continue;
}
spine_filepaths.emplace_back(href);
}
}
}
if (opf_file_path.empty() || spine_filepaths.empty())
{
std::cerr << translate("ERROR: ")
<< format(translate("{0:s} is damaged. Could not read spine. "
"Skipping file.\n")
.str()
.data(),
filepath);
return {};
}
return spine_filepaths;
}
std::string urldecode(const std::string_view url)
{ // RFC 3986, section 2.1.
size_t pos{0};
size_t lastpos{0};
std::string decoded;
while ((pos = url.find('%', pos)) != std::string_view::npos)
{
decoded += url.substr(lastpos, pos - lastpos);
decoded += static_cast<char>(
std::stoul(std::string(url.substr(pos + 1, 2)), nullptr, 16));
pos += 3;
lastpos = pos;
}
decoded += url.substr(lastpos);
return decoded;
}
} // namespace epubgrep::zip

View File

@ -43,6 +43,12 @@ namespace epubgrep::zip
//! Close zip file.
void close_file(struct archive *zipfile, const fs::path &filepath);
//! Returns the files in the EPUB “spine” (all pages that are actually text).
[[nodiscard]] std::vector<std::string> list_spine(const fs::path &filepath);
//! Decode percent-encoding. Used for restricted characters in URLs.
[[nodiscard]] std::string urldecode(std::string_view url);
//! It's std::runtime_error, but with another name.
class exception : public std::runtime_error
{

View File

@ -23,6 +23,7 @@ SCENARIO("Searching works")
{
std::vector<epubgrep::search::match> matches;
epubgrep::search::settings opts;
opts.raw = true;
WHEN("We search for ‘📙+\\w? using extended regular expressions")
{
@ -63,17 +64,19 @@ SCENARIO("Searching works")
REQUIRE_FALSE(exception);
REQUIRE(matches.at(0).filepath == "test folder/😊");
REQUIRE(matches.at(0).text == "📗");
REQUIRE(matches.at(0).context.first == "📖 📘");
REQUIRE(matches.at(0).context.second == "📙 ");
REQUIRE(matches.at(0).context.first == "📖\n\n📘");
REQUIRE(matches.at(0).context.second == "📙\n");
}
}
WHEN("We search for (space) with context = 1.")
WHEN("We search for [ \\n] with context = 1.")
{
try
{
opts.context = 1;
matches = epubgrep::search::search(zipfile, " ", opts);
opts.regex = epubgrep::options::regex_kind::perl;
matches = epubgrep::search::search(zipfile, R"([ \n])",
opts);
}
catch (const std::exception &)
{
@ -83,19 +86,24 @@ SCENARIO("Searching works")
THEN("No exception is thrown")
AND_THEN("It returns the match correctly")
{
// I looked at this a week or so after I've written it, and
// I have come to the realization that this is a tiny bit
// more complicated than strictly required. 😄
// TODO: Rewrite test.zip and tests to be better
// understandable.
REQUIRE_FALSE(exception);
REQUIRE(matches.at(1).filepath == "test folder/test file");
REQUIRE(matches.at(1).text == " ");
REQUIRE(matches.at(1).context.first == "don't");
REQUIRE(matches.at(1).context.second == "want to");
REQUIRE(matches.at(10).filepath == "test folder/😊");
REQUIRE(matches.at(10).text == " ");
REQUIRE(matches.at(10).text == "\n");
REQUIRE(matches.at(10).context.first == "📖");
REQUIRE(matches.at(10).context.second == "📘📗📙 ");
REQUIRE(matches.at(11).filepath == "test folder/😊");
REQUIRE(matches.at(11).text == " ");
REQUIRE(matches.at(11).context.first == "📘📗📙");
REQUIRE(matches.at(11).context.second == "");
REQUIRE(matches.at(10).context.second == "\n📘📗📙\n");
REQUIRE(matches.at(12).filepath == "test folder/😊");
REQUIRE(matches.at(12).text == "\n");
REQUIRE(matches.at(12).context.first == "📘📗📙");
REQUIRE(matches.at(12).context.second.empty());
}
}
@ -119,7 +127,7 @@ SCENARIO("Searching works")
{
REQUIRE_FALSE(exception);
REQUIRE(matches.at(0).filepath == "test folder/test file");
REQUIRE(matches.at(0).text == "work today. I'm stay");
REQUIRE(matches.at(0).text == "work today.\nI'm stay");
REQUIRE(matches.at(0).context.first == "to ");
REQUIRE(matches.at(0).context.second == "ing in");
}