Only search files in the spine, in the right order.
The spine lists all content documents in their linear reading order, so we finally get our results in the right order! 🎉 Since we skip images and fonts, which usually make up most of the bytes in an EPUB file, the performance increase is substantial: I measured 60-70% in a very short test.

Closes: #1
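For context, the spine lives in the EPUB's OPF package document, which META-INF/container.xml points to. Below is a minimal, self-contained sketch (not part of this commit; the XML is a made-up package document) showing how the spine's idrefs resolve to manifest hrefs in reading order with pugixml:

// Sketch only: how an OPF package document maps the spine's idrefs to
// manifest hrefs, in reading order, using pugixml. The XML is invented
// example data, not taken from the test files of this repository.
#include <iostream>
#include <pugixml.hpp>

int main()
{
    const char *opf = R"(<package version="3.0">
  <manifest>
    <item id="ch1"   href="ch1.xhtml" media-type="application/xhtml+xml"/>
    <item id="ch2"   href="ch2.xhtml" media-type="application/xhtml+xml"/>
    <item id="cover" href="cover.jpg" media-type="image/jpeg"/>
  </manifest>
  <spine>
    <itemref idref="ch1"/>
    <itemref idref="ch2"/>
  </spine>
</package>)";

    pugi::xml_document xml;
    if (!xml.load_string(opf))
    {
        return 1;
    }

    const auto manifest = xml.child("package").child("manifest");
    // Only content documents appear in the spine; cover.jpg is listed in the
    // manifest but not here, so it is never searched.
    for (const auto &itemref : xml.child("package").child("spine"))
    {
        const auto item = manifest.find_child_by_attribute(
            "id", itemref.attribute("idref").value());
        std::cout << item.attribute("href").value() << '\n'; // ch1.xhtml, ch2.xhtml
    }
}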
parent: c94d9de0db
commit: 00e3edb9f2

.drone.yml (10 lines changed)
@@ -26,7 +26,7 @@ steps:
   - alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get'
   - apt-get update -q
   - apt-get install -qq build-essential cmake clang locales
-  - apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc
+  - apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc libpugixml-dev
   - rm -rf build && mkdir -p build && cd build
   - cmake -G "Unix Makefiles" -DWITH_TESTS=YES ..
   - make VERBOSE=1
@@ -63,7 +63,7 @@ steps:
   - alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get'
   - apt-get update -q
   - apt-get install -qq g++-8 build-essential clang locales
-  - apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc
+  - apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc libpugixml-dev
   - sh cmake_installer.sh --skip-license --exclude-subdir --prefix=/usr/local
   - rm -rf build && mkdir -p build && cd build
   - cmake -G "Unix Makefiles" -DWITH_TESTS=YES ..
@@ -120,7 +120,7 @@ steps:
   - alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get'
   - apt-get update -q
   - apt-get install -qq build-essential cmake clang locales lsb-release
-  - apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc
+  - apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc libpugixml-dev
   - rm -rf build && mkdir -p build && cd build
   - cmake -G "Unix Makefiles" -DCMAKE_INSTALL_PREFIX=/usr ..
   - make VERBOSE=1
@@ -144,7 +144,7 @@ steps:
   - alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get'
   - apt-get update -q
   - apt-get install -qq build-essential cmake clang locales lsb-release
-  - apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc
+  - apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc libpugixml-dev
   - rm -rf build && mkdir -p build && cd build
   - cmake -G "Unix Makefiles" -DCMAKE_INSTALL_PREFIX=/usr ..
   - make VERBOSE=1
@@ -176,7 +176,7 @@ steps:
   - alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get'
   - apt-get update -q
   - apt-get install -qq g++-8 build-essential clang locales lsb-release
-  - apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc
+  - apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc libpugixml-dev
   - sh cmake_installer.sh --skip-license --exclude-subdir --prefix=/usr/local
   - rm -rf build && mkdir -p build && cd build
   - cmake -G "Unix Makefiles" -DCMAKE_INSTALL_PREFIX=/usr ..
Top-level CMake configuration:

@@ -39,6 +39,7 @@ if(NOT termcolor_FOUND)
   endif()
 endif()
 find_package(Threads REQUIRED)
+find_package(pugixml 1 REQUIRED CONFIG)

 add_subdirectory(src)

README (AsciiDoc):

@@ -15,6 +15,7 @@
 :uri-fmt: https://github.com/fmtlib/fmt
 :uri-asciidoc: http://asciidoc.org/
 :uri-termcolor: https://termcolor.readthedocs.io/
+:uri-pugixml: https://pugixml.org/

 :license: https://schlomp.space/tastytea/{project}/src/branch/main/LICENSE
 :license-termcolor: https://schlomp.space/tastytea/{project}/src/branch/main/dist/termcolor/LICENSE
@@ -84,6 +85,7 @@ If you get the error message that `add-apt-repository` was not found, install
 * link:{uri-asciidoc}[AsciiDoc] (tested: 9.0 / 8.6)
 * link:{uri-termcolor}[Termcolor] (tested: 2.0) (If not found, the bundled
   version is used.)
+* link:{uri-pugixml}[pugixml] (tested: 1.11 / 1.8)
 * Optional
 ** Tests: link:{uri-catch}[Catch] (tested: 2.13 / 1.10)

@@ -95,7 +97,9 @@ of CMake.

 [source,shell]
 --------------------------------------------------------------------------------
-sudo apt install build-essential cmake libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc
+sudo apt install build-essential cmake libboost-program-options-dev \
+    libboost-locale-dev libboost-regex-dev gettext libarchive-dev \
+    libfmt-dev asciidoc libpugixml-dev
 --------------------------------------------------------------------------------

 ==== Get sourcecode
Man page (AsciiDoc):

@@ -2,7 +2,7 @@
 :doctype: manpage
 :Author: tastytea
 :Email: tastytea@tastytea.de
-:Date: 2021-05-28
+:Date: 2021-05-29
 :Revision: 0.0.0
 :man source: epubgrep
 :man manual: General Commands Manual
@@ -50,7 +50,8 @@ Ignore case distinctions in pattern and data.
 Use additional _PATTERN_ for matching. Can be used more than once.

 *-a*, *--raw*::
-Do not clean up text before searching. No HTML stripping, no newline removal.
+Do not clean up text before searching. No HTML stripping, no newline removal,
+all files will be read (not just the text documents listed in the spine).

 *-C* _NUMBER_, *context* _NUMBER_::
 Print _NUMBER_ words of context around matches.
CMake configuration for the library target:

@@ -24,7 +24,8 @@ target_link_libraries(${PROJECT_NAME}_lib
   fmt::fmt
   termcolor::termcolor
   Threads::Threads
-  m)
+  m
+  pugixml)

 if(${CMAKE_VERSION} VERSION_LESS 3.17)
   target_link_libraries(${PROJECT_NAME}_lib
Search implementation:

@@ -62,7 +62,16 @@ std::vector<match> search(const fs::path &filepath,

     const boost::regex re(regex.data(), flags);
     std::vector<match> matches;
-    for (const auto &entry : zip::list(filepath))
+    std::vector<string> epub_filepaths{[&opts, &filepath]
+                                       {
+                                           if (!opts.raw)
+                                           {
+                                               return zip::list_spine(filepath);
+                                           }
+                                           return zip::list(filepath);
+                                       }()};
+
+    for (const auto &entry : epub_filepaths)
     {
         auto document{zip::read_file(filepath, entry)};
         if (!opts.raw)
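Side note (mine, not from the commit): epub_filepaths above is initialised with an immediately invoked lambda, so opts.raw can pick the listing function while the vector is still built in a single initialisation. A stripped-down sketch of the same idiom, with placeholder functions:

// Sketch of the immediately-invoked-lambda initialisation used above.
// list_everything()/list_text_only() are placeholders, not the project's API.
#include <string>
#include <vector>

std::vector<std::string> list_everything() { return {"ch1.xhtml", "cover.jpg"}; }
std::vector<std::string> list_text_only() { return {"ch1.xhtml"}; }

std::vector<std::string> choose_files(bool raw)
{
    // The lambda runs exactly once, so the vector is built in a single
    // initialisation instead of being assigned in two branches.
    const std::vector<std::string> files{[raw]
    {
        if (!raw)
        {
            return list_text_only();
        }
        return list_everything();
    }()};

    return files;
}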
src/zip.cpp (99 lines changed)
@@ -23,6 +23,7 @@
 #include <boost/locale/message.hpp>
 #include <fmt/format.h>
 #include <fmt/ostream.h> // For compatibility with fmt 4.
+#include <pugixml.hpp>

 #include <cstdlib>
 #include <cstring>
@@ -30,6 +31,7 @@
 #include <iostream>
 #include <stdexcept>
 #include <string>
+#include <string_view>
 #include <vector>

 namespace epubgrep::zip
@@ -50,12 +52,12 @@ std::vector<std::string> list(const fs::path &filepath)
         if (in_epub_filepath == nullptr)
         { // If the encoding is broken, we skip the file.
             std::cerr << translate("WARNING: ")
-                      << format(translate("{0:s} is damaged. "
-                                          "Skipping rest of file.\n")
+                      << format(translate("File in {0:s} is damaged. "
+                                          "Skipping in-EPUB file.\n")
                              .str()
                              .data(),
                          filepath);
-            break;
+            continue;
         }
         toc.emplace_back(in_epub_filepath);
         archive_read_data_skip(zipfile);
@@ -74,6 +76,16 @@ std::string read_file(const fs::path &filepath, std::string_view entry_path)
     while (archive_read_next_header(zipfile, &entry) == ARCHIVE_OK)
     {
         const auto *path{archive_entry_pathname_utf8(entry)};
+        if (path == nullptr)
+        { // If the encoding is broken, we skip the file.
+            std::cerr << translate("WARNING: ")
+                      << format(translate("File in {0:s} is damaged. "
+                                          "Skipping in-EPUB file.\n")
+                             .str()
+                             .data(),
+                         filepath);
+            continue;
+        }
         if (std::strcmp(path, entry_path.data()) == 0)
         {
             const auto length{static_cast<size_t>(archive_entry_size(entry))};
@@ -140,4 +152,85 @@ void close_file(struct archive *zipfile, const fs::path &filepath)
     }
 }

+std::vector<std::string> list_spine(const fs::path &filepath)
+{
+    const fs::path opf_file_path{
+        [&filepath]
+        {
+            pugi::xml_document xml;
+            const std::string container{
+                read_file(filepath, "META-INF/container.xml")};
+            const auto result{xml.load_buffer(&container[0], container.size())};
+            if (result)
+            {
+                return xml.child("container")
+                    .child("rootfiles")
+                    .first_child()
+                    .attribute("full-path")
+                    .value();
+            }
+
+            return "";
+        }()};
+
+    std::vector<std::string> spine_filepaths;
+    if (!opf_file_path.empty())
+    {
+        pugi::xml_document xml;
+        const std::string opf_file{read_file(filepath, opf_file_path.string())};
+        const auto result{xml.load_buffer(&opf_file[0], opf_file.size())};
+        if (result)
+        {
+            auto manifest{xml.child("package").child("manifest")};
+            auto spine{xml.child("package").child("spine")};

+            for (const auto &itemref : spine)
+            {
+                const auto &idref{itemref.attribute("idref").value()};
+                const auto &item{manifest.find_child_by_attribute("id", idref)};
+                const std::string href{
+                    urldecode(item.attribute("href").value())};
+                if (href[0] != '/')
+                {
+                    spine_filepaths.emplace_back(
+                        opf_file_path.parent_path() /= href);
+                    continue;
+                }
+                spine_filepaths.emplace_back(href);
+            }
+        }
+    }

+    if (opf_file_path.empty() || spine_filepaths.empty())
+    {
+        std::cerr << translate("ERROR: ")
+                  << format(translate("{0:s} is damaged. Could not read spine. "
+                                      "Skipping file.\n")
+                         .str()
+                         .data(),
+                     filepath);
+        return {};
+    }

+    return spine_filepaths;
+}

+std::string urldecode(const std::string_view url)
+{ // RFC 3986, section 2.1.
+    size_t pos{0};
+    size_t lastpos{0};
+    std::string decoded;
+    while ((pos = url.find('%', pos)) != std::string_view::npos)
+    {
+        decoded += url.substr(lastpos, pos - lastpos);
+        decoded += static_cast<char>(
+            std::stoul(std::string(url.substr(pos + 1, 2)), nullptr, 16));
+        pos += 3;
+        lastpos = pos;
+    }
+    decoded += url.substr(lastpos);

+    return decoded;
+}

 } // namespace epubgrep::zip
Zip module header:

@@ -43,6 +43,12 @@ namespace epubgrep::zip
 //! Close zip file.
 void close_file(struct archive *zipfile, const fs::path &filepath);

+//! Returns the files in the EPUB “spine” (all pages that are actually text).
+[[nodiscard]] std::vector<std::string> list_spine(const fs::path &filepath);
+
+//! Decode percent-encoding. Used for restricted characters in URLs.
+[[nodiscard]] std::string urldecode(std::string_view url);
+
 //! It's std::runtime_error, but with another name.
 class exception : public std::runtime_error
 {
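A hedged usage sketch for the two functions declared above; the include name and the EPUB filename are assumptions, not taken from this commit:

// Usage sketch, assuming the header declaring list_spine()/urldecode() is
// reachable as "zip.hpp" and that a book.epub exists next to the binary.
#include "zip.hpp"

#include <iostream>

int main()
{
    // Spine entries come back in linear reading order, with percent-encoded
    // hrefs already decoded via urldecode().
    for (const auto &entry : epubgrep::zip::list_spine("book.epub"))
    {
        std::cout << entry << '\n';
    }
}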
Tests (Catch):

@@ -23,6 +23,7 @@ SCENARIO("Searching works")
 {
     std::vector<epubgrep::search::match> matches;
     epubgrep::search::settings opts;
+    opts.raw = true;

     WHEN("We search for ‘📙+\\w?’ using extended regular expressions")
     {
@@ -63,17 +64,19 @@ SCENARIO("Searching works")
             REQUIRE_FALSE(exception);
             REQUIRE(matches.at(0).filepath == "test folder/😊");
             REQUIRE(matches.at(0).text == "📗");
-            REQUIRE(matches.at(0).context.first == "📖 📘");
-            REQUIRE(matches.at(0).context.second == "📙 ");
+            REQUIRE(matches.at(0).context.first == "📖\n\n📘");
+            REQUIRE(matches.at(0).context.second == "📙\n");
         }
     }

-    WHEN("We search for ‘ ’ (space) with context = 1.")
+    WHEN("We search for ‘[ \\n]’ with context = 1.")
     {
         try
         {
             opts.context = 1;
-            matches = epubgrep::search::search(zipfile, " ", opts);
+            opts.regex = epubgrep::options::regex_kind::perl;
+            matches = epubgrep::search::search(zipfile, R"([ \n])",
+                                               opts);
         }
         catch (const std::exception &)
         {
@@ -83,19 +86,24 @@ SCENARIO("Searching works")
         THEN("No exception is thrown")
         AND_THEN("It returns the match correctly")
         {
+            // I looked at this a week or so after I've written it, and
+            // I have come to the realization that this is a tiny bit
+            // more complicated than strictly required. 😄
+            // TODO: Rewrite test.zip and tests to be better
+            // understandable.
             REQUIRE_FALSE(exception);
             REQUIRE(matches.at(1).filepath == "test folder/test file");
             REQUIRE(matches.at(1).text == " ");
             REQUIRE(matches.at(1).context.first == "don't");
             REQUIRE(matches.at(1).context.second == "want to");
             REQUIRE(matches.at(10).filepath == "test folder/😊");
-            REQUIRE(matches.at(10).text == " ");
+            REQUIRE(matches.at(10).text == "\n");
             REQUIRE(matches.at(10).context.first == "📖");
-            REQUIRE(matches.at(10).context.second == "📘📗📙 ");
-            REQUIRE(matches.at(11).filepath == "test folder/😊");
-            REQUIRE(matches.at(11).text == " ");
-            REQUIRE(matches.at(11).context.first == "📘📗📙");
-            REQUIRE(matches.at(11).context.second == "");
+            REQUIRE(matches.at(10).context.second == "\n📘📗📙\n");
+            REQUIRE(matches.at(12).filepath == "test folder/😊");
+            REQUIRE(matches.at(12).text == "\n");
+            REQUIRE(matches.at(12).context.first == "📘📗📙");
+            REQUIRE(matches.at(12).context.second.empty());
         }
     }

@@ -119,7 +127,7 @@ SCENARIO("Searching works")
         {
             REQUIRE_FALSE(exception);
             REQUIRE(matches.at(0).filepath == "test folder/test file");
-            REQUIRE(matches.at(0).text == "work today. I'm stay");
+            REQUIRE(matches.at(0).text == "work today.\nI'm stay");
             REQUIRE(matches.at(0).context.first == "to ");
             REQUIRE(matches.at(0).context.second == "ing in");
         }