Only search files in spine, in the right order.

The spine lists all content documents in their linear reading order, so we finally get our results in the right order! 🎉 Since we skip the images and fonts, which usually make up most of the bytes in an EPUB file, the performance increase is immense: I measured a 60-70% improvement in a very short test. Closes: #1
2021-05-29 15:50:03 +02:00 · 2021-05-29 15:50:03 +02:00 · 00e3edb9f2
commit 00e3edb9f2
parent c94d9de0db
9 changed files with 147 additions and 24 deletions
--- a/.drone.yml
+++ b/.drone.yml
@ -26,7 +26,7 @@ steps:
  - alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get'
  - apt-get update -q
  - apt-get install -qq build-essential cmake clang locales
-  - apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc
+  - apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc libpugixml-dev
  - rm -rf build && mkdir -p build && cd build
  - cmake -G "Unix Makefiles" -DWITH_TESTS=YES ..
  - make VERBOSE=1
@ -63,7 +63,7 @@ steps:
  - alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get'
  - apt-get update -q
  - apt-get install -qq g++-8 build-essential clang locales
-  - apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc
+  - apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc libpugixml-dev
  - sh cmake_installer.sh --skip-license --exclude-subdir --prefix=/usr/local
  - rm -rf build && mkdir -p build && cd build
  - cmake -G "Unix Makefiles" -DWITH_TESTS=YES ..
@ -120,7 +120,7 @@ steps:
  - alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get'
  - apt-get update -q
  - apt-get install -qq build-essential cmake clang locales lsb-release
-  - apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc
+  - apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc libpugixml-dev
  - rm -rf build && mkdir -p build && cd build
  - cmake -G "Unix Makefiles" -DCMAKE_INSTALL_PREFIX=/usr ..
  - make VERBOSE=1
@ -144,7 +144,7 @@ steps:
  - alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get'
  - apt-get update -q
  - apt-get install -qq build-essential cmake clang locales lsb-release
-  - apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc
+  - apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc libpugixml-dev
  - rm -rf build && mkdir -p build && cd build
  - cmake -G "Unix Makefiles" -DCMAKE_INSTALL_PREFIX=/usr ..
  - make VERBOSE=1
@ -176,7 +176,7 @@ steps:
  - alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get'
  - apt-get update -q
  - apt-get install -qq g++-8 build-essential clang locales lsb-release
-  - apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc
+  - apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc libpugixml-dev
  - sh cmake_installer.sh --skip-license --exclude-subdir --prefix=/usr/local
  - rm -rf build && mkdir -p build && cd build
  - cmake -G "Unix Makefiles" -DCMAKE_INSTALL_PREFIX=/usr ..
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -39,6 +39,7 @@ if(NOT termcolor_FOUND)
  endif()
 endif()
 find_package(Threads REQUIRED)
+find_package(pugixml 1 REQUIRED CONFIG)

 add_subdirectory(src)

--- a/README.adoc
+++ b/README.adoc
@ -15,6 +15,7 @@
 :uri-fmt: https://github.com/fmtlib/fmt
 :uri-asciidoc: http://asciidoc.org/
 :uri-termcolor: https://termcolor.readthedocs.io/
+:uri-pugixml: https://pugixml.org/

 :license: https://schlomp.space/tastytea/{project}/src/branch/main/LICENSE
 :license-termcolor: https://schlomp.space/tastytea/{project}/src/branch/main/dist/termcolor/LICENSE
@ -84,6 +85,7 @@ If you get the error message that `add-apt-repository` was not found, install
 * link:{uri-asciidoc}[AsciiDoc] (tested: 9.0 / 8.6)
 * link:{uri-termcolor}[Termcolor] (tested: 2.0) (If not found, the bundled
  version is used.)
+* link:{uri-pugixml}[pugixml] (tested: 1.11 / 1.8)
 * Optional
  ** Tests: link:{uri-catch}[Catch] (tested: 2.13 / 1.10)

@ -95,7 +97,9 @@ of CMake.

 [source,shell]
 --------------------------------------------------------------------------------
-sudo apt install build-essential cmake libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc
+sudo apt install build-essential cmake libboost-program-options-dev \
+                 libboost-locale-dev libboost-regex-dev gettext libarchive-dev \
+                 libfmt-dev asciidoc libpugixml-dev
 --------------------------------------------------------------------------------

 ==== Get sourcecode
--- a/man/epubgrep.1.adoc
+++ b/man/epubgrep.1.adoc
@ -2,7 +2,7 @@
 :doctype:       manpage
 :Author:        tastytea
 :Email:         tastytea@tastytea.de
-:Date:          2021-05-28
+:Date:          2021-05-29
 :Revision:      0.0.0
 :man source:    epubgrep
 :man manual:    General Commands Manual
@ -50,7 +50,8 @@ Ignore case distinctions in pattern and data.
 Use additional _PATTERN_ for matching. Can be used more than once.

 *-a*, *--raw*::
-Do not clean up text before searching. No HTML stripping, no newline removal.
+Do not clean up text before searching: no HTML stripping, no newline removal,
+and all files are read (not just the text documents listed in the spine).

 *-C* _NUMBER_, *--context* _NUMBER_::
 Print _NUMBER_ words of context around matches.
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -24,7 +24,8 @@ target_link_libraries(${PROJECT_NAME}_lib
  fmt::fmt
  termcolor::termcolor
  Threads::Threads
-  m)
+  m
+  pugixml)

 if(${CMAKE_VERSION} VERSION_LESS 3.17)
  target_link_libraries(${PROJECT_NAME}_lib
--- a/src/search.cpp
+++ b/src/search.cpp
@ -62,7 +62,16 @@ std::vector<match> search(const fs::path &filepath,

    const boost::regex re(regex.data(), flags);
    std::vector<match> matches;
-    for (const auto &entry : zip::list(filepath))
+    std::vector<string> epub_filepaths{[&opts, &filepath]
+                                       {
+                                           if (!opts.raw)
+                                           {
+                                               return zip::list_spine(filepath);
+                                           }
+                                           return zip::list(filepath);
+                                       }()};
+
+    for (const auto &entry : epub_filepaths)
    {
        auto document{zip::read_file(filepath, entry)};
        if (!opts.raw)
--- a/src/zip.cpp
+++ b/src/zip.cpp
@ -23,6 +23,7 @@
 #include <boost/locale/message.hpp>
 #include <fmt/format.h>
 #include <fmt/ostream.h> // For compatibility with fmt 4.
+#include <pugixml.hpp>

 #include <cstdlib>
 #include <cstring>
@ -30,6 +31,7 @@
 #include <iostream>
 #include <stdexcept>
 #include <string>
+#include <string_view>
 #include <vector>

 namespace epubgrep::zip
@ -50,12 +52,12 @@ std::vector<std::string> list(const fs::path &filepath)
        if (in_epub_filepath == nullptr)
        { // If the encoding is broken, we skip the file.
            std::cerr << translate("WARNING: ")
-                      << format(translate("{0:s} is damaged. "
-                                          "Skipping rest of file.\n")
+                      << format(translate("File in {0:s} is damaged. "
+                                          "Skipping in-EPUB file.\n")
                                    .str()
                                    .data(),
                                filepath);
-            break;
+            continue;
        }
        toc.emplace_back(in_epub_filepath);
        archive_read_data_skip(zipfile);
@ -74,6 +76,16 @@ std::string read_file(const fs::path &filepath, std::string_view entry_path)
    while (archive_read_next_header(zipfile, &entry) == ARCHIVE_OK)
    {
        const auto *path{archive_entry_pathname_utf8(entry)};
+        if (path == nullptr)
+        { // If the encoding is broken, we skip the file.
+            std::cerr << translate("WARNING: ")
+                      << format(translate("File in {0:s} is damaged. "
+                                          "Skipping in-EPUB file.\n")
+                                    .str()
+                                    .data(),
+                                filepath);
+            continue;
+        }
        if (std::strcmp(path, entry_path.data()) == 0)
        {
            const auto length{static_cast<size_t>(archive_entry_size(entry))};
@ -140,4 +152,85 @@ void close_file(struct archive *zipfile, const fs::path &filepath)
    }
 }

+std::vector<std::string> list_spine(const fs::path &filepath)
+{ // Returns the in-EPUB paths of the documents in the spine, in spine order.
+    // META-INF/container.xml names the OPF package document in the
+    // “full-path” attribute of the first child of <rootfiles>.
+    const fs::path opf_file_path{
+        [&filepath]() -> std::string
+        {
+            pugi::xml_document xml;
+            const std::string container{
+                read_file(filepath, "META-INF/container.xml")};
+            const auto result{
+                xml.load_buffer(container.data(), container.size())};
+            if (!result)
+            {
+                return {};
+            }

+            // value() points into memory owned by xml. The explicit
+            // std::string return type makes sure the text is copied before
+            // xml is destroyed at the end of this lambda.
+            return xml.child("container")
+                .child("rootfiles")
+                .first_child()
+                .attribute("full-path")
+                .value();
+        }()};

+    std::vector<std::string> spine_filepaths;
+    if (!opf_file_path.empty())
+    {
+        pugi::xml_document xml;
+        const std::string opf_file{read_file(filepath, opf_file_path.string())};
+        const auto result{xml.load_buffer(opf_file.data(), opf_file.size())};
+        if (result)
+        {
+            const auto manifest{xml.child("package").child("manifest")};
+            const auto spine{xml.child("package").child("spine")};

+            // Each spine child refers to a manifest item by id; the item's
+            // href is the (percent-encoded) path of the document.
+            for (const auto &itemref : spine)
+            {
+                const auto *idref{itemref.attribute("idref").value()};
+                const auto item{manifest.find_child_by_attribute("id", idref)};
+                const std::string href{
+                    urldecode(item.attribute("href").value())};
+                if (href.empty())
+                { // No matching manifest item, or the item has no href.
+                    continue;
+                }

+                if (href[0] != '/')
+                { // Prefix the directory of the OPF file.
+                    spine_filepaths.emplace_back(
+                        (opf_file_path.parent_path() / href).string());
+                    continue;
+                }
+                spine_filepaths.emplace_back(href);
+            }
+        }
+    }

+    // An unreadable container / OPF file also ends up here, because nothing
+    // was added to the list in that case.
+    if (spine_filepaths.empty())
+    {
+        std::cerr << translate("ERROR: ")
+                  << format(translate("{0:s} is damaged. Could not read spine. "
+                                      "Skipping file.\n")
+                                .str()
+                                .data(),
+                            filepath);
+    }

+    return spine_filepaths;
+}
+
+std::string urldecode(const std::string_view url)
+{ // RFC 3986, section 2.1.
+    // Decodes every “%XX” (two hexadecimal digits) into the byte it encodes.
+    // A ‘%’ that is not followed by two hexadecimal digits is copied
+    // unchanged instead of throwing, so damaged hrefs can't abort the search.

+    // Value of a hexadecimal digit, or -1 if the character is not one.
+    const auto hex_value{[](const char digit) -> int
+                         {
+                             if (digit >= '0' && digit <= '9')
+                             {
+                                 return digit - '0';
+                             }
+                             if (digit >= 'a' && digit <= 'f')
+                             {
+                                 return digit - 'a' + 10;
+                             }
+                             if (digit >= 'A' && digit <= 'F')
+                             {
+                                 return digit - 'A' + 10;
+                             }
+                             return -1;
+                         }};

+    std::string decoded;
+    decoded.reserve(url.size());
+    for (size_t pos{0}; pos < url.size(); ++pos)
+    {
+        // Only look at the two following characters if they exist.
+        if (url[pos] == '%' && url.size() - pos > 2)
+        {
+            const int high{hex_value(url[pos + 1])};
+            const int low{hex_value(url[pos + 2])};
+            if (high >= 0 && low >= 0)
+            {
+                decoded += static_cast<char>(high * 16 + low);
+                pos += 2; // Skip the two digits we just consumed.
+                continue;
+            }
+        }
+        decoded += url[pos];
+    }

+    return decoded;
+}
+
 } // namespace epubgrep::zip
--- a/src/zip.hpp
+++ b/src/zip.hpp
@ -43,6 +43,12 @@ namespace epubgrep::zip
 //! Close zip file.
 void close_file(struct archive *zipfile, const fs::path &filepath);

+//! Returns the files in the EPUB “spine” (all pages that are actually text).
+[[nodiscard]] std::vector<std::string> list_spine(const fs::path &filepath);
+
+//! Decode percent-encoding. Used for restricted characters in URLs.
+[[nodiscard]] std::string urldecode(std::string_view url);
+
 //! It's std::runtime_error, but with another name.
 class exception : public std::runtime_error
 {
--- a/tests/test_search_zip.cpp
+++ b/tests/test_search_zip.cpp
@ -23,6 +23,7 @@ SCENARIO("Searching works")
        {
            std::vector<epubgrep::search::match> matches;
            epubgrep::search::settings opts;
+            opts.raw = true;

            WHEN("We search for ‘📙+\\w?’ using extended regular expressions")
            {
@ -63,17 +64,19 @@ SCENARIO("Searching works")
                    REQUIRE_FALSE(exception);
                    REQUIRE(matches.at(0).filepath == "test folder/😊");
                    REQUIRE(matches.at(0).text == "📗");
-                    REQUIRE(matches.at(0).context.first == "📖 📘");
-                    REQUIRE(matches.at(0).context.second == "📙 ");
+                    REQUIRE(matches.at(0).context.first == "📖\n\n📘");
+                    REQUIRE(matches.at(0).context.second == "📙\n");
                }
            }

-            WHEN("We search for ‘ ’ (space) with context = 1.")
+            WHEN("We search for ‘[ \\n]’ with context = 1.")
            {
                try
                {
                    opts.context = 1;
-                    matches = epubgrep::search::search(zipfile, " ", opts);
+                    opts.regex = epubgrep::options::regex_kind::perl;
+                    matches = epubgrep::search::search(zipfile, R"([ \n])",
+                                                       opts);
                }
                catch (const std::exception &)
                {
@ -83,19 +86,24 @@ SCENARIO("Searching works")
                THEN("No exception is thrown")
                AND_THEN("It returns the match correctly")
                {
+                    // I looked at this a week or so after I wrote it, and I
+                    // have come to the realization that this is a tiny bit
+                    // more complicated than strictly required. 😄
+                    // TODO: Rewrite test.zip and the tests to be easier to
+                    // understand.
                    REQUIRE_FALSE(exception);
                    REQUIRE(matches.at(1).filepath == "test folder/test file");
                    REQUIRE(matches.at(1).text == " ");
                    REQUIRE(matches.at(1).context.first == "don't");
                    REQUIRE(matches.at(1).context.second == "want to");
                    REQUIRE(matches.at(10).filepath == "test folder/😊");
-                    REQUIRE(matches.at(10).text == " ");
+                    REQUIRE(matches.at(10).text == "\n");
                    REQUIRE(matches.at(10).context.first == "📖");
-                    REQUIRE(matches.at(10).context.second == "📘📗📙 ");
-                    REQUIRE(matches.at(11).filepath == "test folder/😊");
-                    REQUIRE(matches.at(11).text == " ");
-                    REQUIRE(matches.at(11).context.first == "📘📗📙");
-                    REQUIRE(matches.at(11).context.second == "");
+                    REQUIRE(matches.at(10).context.second == "\n📘📗📙\n");
+                    REQUIRE(matches.at(12).filepath == "test folder/😊");
+                    REQUIRE(matches.at(12).text == "\n");
+                    REQUIRE(matches.at(12).context.first == "📘📗📙");
+                    REQUIRE(matches.at(12).context.second.empty());
                }
            }

@ -119,7 +127,7 @@ SCENARIO("Searching works")
                {
                    REQUIRE_FALSE(exception);
                    REQUIRE(matches.at(0).filepath == "test folder/test file");
-                    REQUIRE(matches.at(0).text == "work today. I'm stay");
+                    REQUIRE(matches.at(0).text == "work today.\nI'm stay");
                    REQUIRE(matches.at(0).context.first == "to ");
                    REQUIRE(matches.at(0).context.second == "ing in");
                }