From 00e3edb9f20249cc3608ca10b71e39d1ecc76297 Mon Sep 17 00:00:00 2001
From: tastytea <tastytea@tastytea.de>
Date: Sat, 29 May 2021 15:50:03 +0200
Subject: [PATCH] Only search files in spine, in the right order.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The spine lists all content documents in their linear reading order. So we're
finally getting our results in the right order! 🎉

Since we skip the images and fonts, which usually make up the most bytes in an
EPUB file, the performance increase is immense. I measured 60-70% in a very
short test.

Closes: https://schlomp.space/tastytea/epubgrep/issues/1
---
 .drone.yml                                    | 10 +-
 CMakeLists.txt                                |  1 +
 README.adoc                                   |  6 +-
 man/epubgrep.1.adoc                           |  5 +-
 src/CMakeLists.txt                            |  3 +-
 src/search.cpp                                | 11 ++-
 src/zip.cpp                                   | 99 ++++++++++++++++++-
 src/zip.hpp                                   |  6 ++
 .../{test_search.cpp => test_search_zip.cpp}  | 30 +++---
 9 files changed, 147 insertions(+), 24 deletions(-)
 rename tests/{test_search.cpp => test_search_zip.cpp} (79%)

diff --git a/.drone.yml b/.drone.yml
index 3493d03..6d84804 100644
--- a/.drone.yml
+++ b/.drone.yml
@@ -26,7 +26,7 @@ steps:
   - alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get'
   - apt-get update -q
   - apt-get install -qq build-essential cmake clang locales
-  - apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc
+  - apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc libpugixml-dev
   - rm -rf build && mkdir -p build && cd build
   - cmake -G "Unix Makefiles" -DWITH_TESTS=YES ..
   - make VERBOSE=1
@@ -63,7 +63,7 @@ steps:
   - alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get'
   - apt-get update -q
   - apt-get install -qq g++-8 build-essential clang locales
-  - apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc
+  - apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc libpugixml-dev
   - sh cmake_installer.sh --skip-license --exclude-subdir --prefix=/usr/local
   - rm -rf build && mkdir -p build && cd build
   - cmake -G "Unix Makefiles" -DWITH_TESTS=YES ..
@@ -120,7 +120,7 @@ steps:
   - alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get'
   - apt-get update -q
   - apt-get install -qq build-essential cmake clang locales lsb-release
-  - apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc
+  - apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc libpugixml-dev
   - rm -rf build && mkdir -p build && cd build
   - cmake -G "Unix Makefiles" -DCMAKE_INSTALL_PREFIX=/usr ..
   - make VERBOSE=1
@@ -144,7 +144,7 @@ steps:
   - alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get'
   - apt-get update -q
   - apt-get install -qq build-essential cmake clang locales lsb-release
-  - apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc
+  - apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc libpugixml-dev
   - rm -rf build && mkdir -p build && cd build
   - cmake -G "Unix Makefiles" -DCMAKE_INSTALL_PREFIX=/usr ..
   - make VERBOSE=1
@@ -176,7 +176,7 @@ steps:
   - alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get'
   - apt-get update -q
   - apt-get install -qq g++-8 build-essential clang locales lsb-release
-  - apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc
+  - apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc libpugixml-dev
   - sh cmake_installer.sh --skip-license --exclude-subdir --prefix=/usr/local
   - rm -rf build && mkdir -p build && cd build
   - cmake -G "Unix Makefiles" -DCMAKE_INSTALL_PREFIX=/usr ..
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 867e09e..79fa19e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -39,6 +39,7 @@ if(NOT termcolor_FOUND)
   endif()
 endif()
 find_package(Threads REQUIRED)
+find_package(pugixml 1 REQUIRED CONFIG)
 
 add_subdirectory(src)
 
diff --git a/README.adoc b/README.adoc
index e5fb7ff..daaceb7 100644
--- a/README.adoc
+++ b/README.adoc
@@ -15,6 +15,7 @@
 :uri-fmt: https://github.com/fmtlib/fmt
 :uri-asciidoc: http://asciidoc.org/
 :uri-termcolor: https://termcolor.readthedocs.io/
+:uri-pugixml: https://pugixml.org/
 
 :license: https://schlomp.space/tastytea/{project}/src/branch/main/LICENSE
 :license-termcolor: https://schlomp.space/tastytea/{project}/src/branch/main/dist/termcolor/LICENSE
@@ -84,6 +85,7 @@ If you get the error message that `add-apt-repository` was not found, install
 * link:{uri-asciidoc}[AsciiDoc] (tested: 9.0 / 8.6)
 * link:{uri-termcolor}[Termcolor] (tested: 2.0) (If not found, the bundled
   version is used.)
+* link:{uri-pugixml}[pugixml] (tested: 1.11 / 1.8)
 * Optional
   ** Tests: link:{uri-catch}[Catch] (tested: 2.13 / 1.10)
 
@@ -95,7 +97,9 @@ of CMake.
 
 [source,shell]
 --------------------------------------------------------------------------------
-sudo apt install build-essential cmake libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc
+sudo apt install build-essential cmake libboost-program-options-dev \
+                 libboost-locale-dev libboost-regex-dev gettext libarchive-dev \
+                 libfmt-dev asciidoc libpugixml-dev
 --------------------------------------------------------------------------------
 
 ==== Get sourcecode
diff --git a/man/epubgrep.1.adoc b/man/epubgrep.1.adoc
index 6fafdba..c07fb31 100644
--- a/man/epubgrep.1.adoc
+++ b/man/epubgrep.1.adoc
@@ -2,7 +2,7 @@
 :doctype:       manpage
 :Author:        tastytea
 :Email:         tastytea@tastytea.de
-:Date:          2021-05-28
+:Date:          2021-05-29
 :Revision:      0.0.0
 :man source:    epubgrep
 :man manual:    General Commands Manual
@@ -50,7 +50,8 @@ Ignore case distinctions in pattern and data.
 Use additional _PATTERN_ for matching. Can be used more than once.
 
 *-a*, *--raw*::
-Do not clean up text before searching. No HTML stripping, no newline removal.
+Do not clean up text before searching. No HTML stripping, no newline removal,
+all files will be read (not just the text documents listed in the spine).
 
 *-C* _NUMBER_, *context* _NUMBER_::
 Print _NUMBER_ words of context around matches.
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index df6a810..1bdff15 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -24,7 +24,8 @@ target_link_libraries(${PROJECT_NAME}_lib
   fmt::fmt
   termcolor::termcolor
   Threads::Threads
-  m)
+  m
+  pugixml)
 
 if(${CMAKE_VERSION} VERSION_LESS 3.17)
   target_link_libraries(${PROJECT_NAME}_lib
diff --git a/src/search.cpp b/src/search.cpp
index 050bfbc..aa6d078 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -62,7 +62,16 @@ std::vector<match> search(const fs::path &filepath,
 
     const boost::regex re(regex.data(), flags);
     std::vector<match> matches;
-    for (const auto &entry : zip::list(filepath))
+    std::vector<string> epub_filepaths{[&opts, &filepath]
+                                       {
+                                           if (!opts.raw)
+                                           {
+                                               return zip::list_spine(filepath);
+                                           }
+                                           return zip::list(filepath);
+                                       }()};
+
+    for (const auto &entry : epub_filepaths)
     {
         auto document{zip::read_file(filepath, entry)};
         if (!opts.raw)
diff --git a/src/zip.cpp b/src/zip.cpp
index 67d9b70..d997cda 100644
--- a/src/zip.cpp
+++ b/src/zip.cpp
@@ -23,6 +23,7 @@
 #include <boost/locale/message.hpp>
 #include <fmt/format.h>
 #include <fmt/ostream.h> // For compatibility with fmt 4.
+#include <pugixml.hpp>
 
 #include <cstdlib>
 #include <cstring>
@@ -30,6 +31,7 @@
 #include <iostream>
 #include <stdexcept>
 #include <string>
+#include <string_view>
 #include <vector>
 
 namespace epubgrep::zip
@@ -50,12 +52,12 @@ std::vector<std::string> list(const fs::path &filepath)
         if (in_epub_filepath == nullptr)
         { // If the encoding is broken, we skip the file.
             std::cerr << translate("WARNING: ")
-                      << format(translate("{0:s} is damaged. "
-                                          "Skipping rest of file.\n")
+                      << format(translate("File in {0:s} is damaged. "
+                                          "Skipping in-EPUB file.\n")
                                     .str()
                                     .data(),
                                 filepath);
-            break;
+            continue;
         }
         toc.emplace_back(in_epub_filepath);
         archive_read_data_skip(zipfile);
@@ -74,6 +76,16 @@ std::string read_file(const fs::path &filepath, std::string_view entry_path)
     while (archive_read_next_header(zipfile, &entry) == ARCHIVE_OK)
     {
         const auto *path{archive_entry_pathname_utf8(entry)};
+        if (path == nullptr)
+        { // If the encoding is broken, we skip the file.
+            std::cerr << translate("WARNING: ")
+                      << format(translate("File in {0:s} is damaged. "
+                                          "Skipping in-EPUB file.\n")
+                                    .str()
+                                    .data(),
+                                filepath);
+            continue;
+        }
         if (std::strcmp(path, entry_path.data()) == 0)
         {
             const auto length{static_cast<size_t>(archive_entry_size(entry))};
@@ -140,4 +152,85 @@ void close_file(struct archive *zipfile, const fs::path &filepath)
     }
 }
 
+std::vector<std::string> list_spine(const fs::path &filepath)
+{
+    const fs::path opf_file_path{
+        [&filepath]
+        {
+            pugi::xml_document xml;
+            const std::string container{
+                read_file(filepath, "META-INF/container.xml")};
+            const auto result{xml.load_buffer(&container[0], container.size())};
+            if (result)
+            {
+                return xml.child("container")
+                    .child("rootfiles")
+                    .first_child()
+                    .attribute("full-path")
+                    .value();
+            }
+
+            return "";
+        }()};
+
+    std::vector<std::string> spine_filepaths;
+    if (!opf_file_path.empty())
+    {
+        pugi::xml_document xml;
+        const std::string opf_file{read_file(filepath, opf_file_path.string())};
+        const auto result{xml.load_buffer(&opf_file[0], opf_file.size())};
+        if (result)
+        {
+            auto manifest{xml.child("package").child("manifest")};
+            auto spine{xml.child("package").child("spine")};
+
+            for (const auto &itemref : spine)
+            {
+                const auto &idref{itemref.attribute("idref").value()};
+                const auto &item{manifest.find_child_by_attribute("id", idref)};
+                const std::string href{
+                    urldecode(item.attribute("href").value())};
+                if (href[0] != '/')
+                {
+                    spine_filepaths.emplace_back(
+                        opf_file_path.parent_path() /= href);
+                    continue;
+                }
+                spine_filepaths.emplace_back(href);
+            }
+        }
+    }
+
+    if (opf_file_path.empty() || spine_filepaths.empty())
+    {
+        std::cerr << translate("ERROR: ")
+                  << format(translate("{0:s} is damaged. Could not read spine. "
+                                      "Skipping file.\n")
+                                .str()
+                                .data(),
+                            filepath);
+        return {};
+    }
+
+    return spine_filepaths;
+}
+
+std::string urldecode(const std::string_view url)
+{ // RFC 3986, section 2.1.
+    size_t pos{0};
+    size_t lastpos{0};
+    std::string decoded;
+    while ((pos = url.find('%', pos)) != std::string_view::npos)
+    {
+        decoded += url.substr(lastpos, pos - lastpos);
+        decoded += static_cast<char>(
+            std::stoul(std::string(url.substr(pos + 1, 2)), nullptr, 16));
+        pos += 3;
+        lastpos = pos;
+    }
+    decoded += url.substr(lastpos);
+
+    return decoded;
+}
+
 } // namespace epubgrep::zip
diff --git a/src/zip.hpp b/src/zip.hpp
index e8719a5..c0de8f1 100644
--- a/src/zip.hpp
+++ b/src/zip.hpp
@@ -43,6 +43,12 @@ namespace epubgrep::zip
 //! Close zip file.
 void close_file(struct archive *zipfile, const fs::path &filepath);
 
+//! Returns the files in the EPUB “spine” (all pages that are actually text).
+[[nodiscard]] std::vector<std::string> list_spine(const fs::path &filepath);
+
+//! Decode percent-encoding. Used for restricted characters in URLs.
+[[nodiscard]] std::string urldecode(std::string_view url);
+
 //! It's std::runtime_error, but with another name.
 class exception : public std::runtime_error
 {
diff --git a/tests/test_search.cpp b/tests/test_search_zip.cpp
similarity index 79%
rename from tests/test_search.cpp
rename to tests/test_search_zip.cpp
index d4859d9..b5da7af 100644
--- a/tests/test_search.cpp
+++ b/tests/test_search_zip.cpp
@@ -23,6 +23,7 @@ SCENARIO("Searching works")
         {
             std::vector<epubgrep::search::match> matches;
             epubgrep::search::settings opts;
+            opts.raw = true;
 
             WHEN("We search for ‘📙+\\w?’ using extended regular expressions")
             {
@@ -63,17 +64,19 @@ SCENARIO("Searching works")
                     REQUIRE_FALSE(exception);
                     REQUIRE(matches.at(0).filepath == "test folder/😊");
                     REQUIRE(matches.at(0).text == "📗");
-                    REQUIRE(matches.at(0).context.first == "📖 📘");
-                    REQUIRE(matches.at(0).context.second == "📙 ");
+                    REQUIRE(matches.at(0).context.first == "📖\n\n📘");
+                    REQUIRE(matches.at(0).context.second == "📙\n");
                 }
             }
 
-            WHEN("We search for ‘ ’ (space) with context = 1.")
+            WHEN("We search for ‘[ \\n]’ with context = 1.")
             {
                 try
                 {
                     opts.context = 1;
-                    matches = epubgrep::search::search(zipfile, " ", opts);
+                    opts.regex = epubgrep::options::regex_kind::perl;
+                    matches = epubgrep::search::search(zipfile, R"([ \n])",
+                                                       opts);
                 }
                 catch (const std::exception &)
                 {
@@ -83,19 +86,24 @@ SCENARIO("Searching works")
                 THEN("No exception is thrown")
                 AND_THEN("It returns the match correctly")
                 {
+                    // I looked at this a week or so after I've written it, and
+                    // I have come to the realization that this is a tiny bit
+                    // more complicated than strictly required. 😄
+                    // TODO: Rewrite test.zip and tests to be better
+                    // understandable.
                     REQUIRE_FALSE(exception);
                     REQUIRE(matches.at(1).filepath == "test folder/test file");
                     REQUIRE(matches.at(1).text == " ");
                     REQUIRE(matches.at(1).context.first == "don't");
                     REQUIRE(matches.at(1).context.second == "want to");
                     REQUIRE(matches.at(10).filepath == "test folder/😊");
-                    REQUIRE(matches.at(10).text == " ");
+                    REQUIRE(matches.at(10).text == "\n");
                     REQUIRE(matches.at(10).context.first == "📖");
-                    REQUIRE(matches.at(10).context.second == "📘📗📙 ");
-                    REQUIRE(matches.at(11).filepath == "test folder/😊");
-                    REQUIRE(matches.at(11).text == " ");
-                    REQUIRE(matches.at(11).context.first == "📘📗📙");
-                    REQUIRE(matches.at(11).context.second == "");
+                    REQUIRE(matches.at(10).context.second == "\n📘📗📙\n");
+                    REQUIRE(matches.at(12).filepath == "test folder/😊");
+                    REQUIRE(matches.at(12).text == "\n");
+                    REQUIRE(matches.at(12).context.first == "📘📗📙");
+                    REQUIRE(matches.at(12).context.second.empty());
                 }
             }
 
@@ -119,7 +127,7 @@ SCENARIO("Searching works")
                 {
                     REQUIRE_FALSE(exception);
                     REQUIRE(matches.at(0).filepath == "test folder/test file");
-                    REQUIRE(matches.at(0).text == "work today. I'm stay");
+                    REQUIRE(matches.at(0).text == "work today.\nI'm stay");
                     REQUIRE(matches.at(0).context.first == "to ");
                     REQUIRE(matches.at(0).context.second == "ing in");
                 }