diff --git a/remwharead.1.adoc b/remwharead.1.adoc index 75325a9..a1f7a93 100644 --- a/remwharead.1.adoc +++ b/remwharead.1.adoc @@ -13,20 +13,20 @@ remwharead - Remember what you read, and when == SYNOPSIS -*remwharead* [*-t* _tags_] _URL_ +*remwharead* [*-t* _tags_] _URI_ *remwharead* *-e* _format_ [*-f* _file_] [*-s* _start_,_end_] == DESCRIPTION -*remwharead* saves URLs of things you read in a database along with an URL to +*remwharead* saves URIs of things you read in a database along with an URI to the archived version, the current date and time, title, description, the full text of the page and optional tags. == OPTIONS *-t* _tags_, *--tags* _tags_:: -Add tags to _URL_, delimited by commas. +Add tags to _URI_, delimited by commas. *-e* _format_, *--export* _format_:: Export to _format_. Possible values are _csv_ and _asciidoc_. See _FORMATS_. diff --git a/src/main.cpp b/src/main.cpp index e3cb4de..8300c39 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -19,8 +19,8 @@ #include #include "sqlite.hpp" #include "parse_options.hpp" -#include "url.hpp" #include "csv.hpp" +#include "uri.hpp" using std::cout; using std::cerr; @@ -44,11 +44,11 @@ int main(const int argc, const char *argv[]) return 2; } - if (!opts.url.empty()) + if (!opts.uri.empty()) { - URL url(opts.url); - html_extract page = url.get(); - db.store({opts.url, url.archive(), system_clock::now(), opts.tags, + URI uri(opts.uri); + html_extract page = uri.get(); + db.store({opts.uri, uri.archive(), system_clock::now(), opts.tags, page.title, page.description, page.fulltext}); } diff --git a/src/parse_options.cpp b/src/parse_options.cpp index 6aaa5f4..0bee8e5 100644 --- a/src/parse_options.cpp +++ b/src/parse_options.cpp @@ -44,7 +44,7 @@ const options parse_options(const int argc, const char *argv[]) { popl::OptionParser op("Available options"); op.add> - ("t", "tags", "Add tags to URL, delimited by commas.", "", &tags); + ("t", "tags", "Add tags to URI, delimited by commas.", "", &tags); op.add> ("e", "export", "Export to format.", "", &format); op.add> @@ -62,7 +62,7 @@ const options parse_options(const int argc, const char *argv[]) if (option_help->is_set()) { - cout << "Usage: " << argv[0] << " [-t tags] URL\n" + cout << "Usage: " << argv[0] << " [-t tags] URI\n" << " " << argv[0] << " -e format [-f file] [-s start,end]\n"; cout << op; @@ -155,12 +155,12 @@ const options parse_options(const int argc, const char *argv[]) if (op.non_option_args().size() > 0) { - opts.url = op.non_option_args().front(); + opts.uri = op.non_option_args().front(); } - if (opts.url == "" && opts.format == export_format::undefined) + if (opts.uri == "" && opts.format == export_format::undefined) { - cerr << "Error: You have to specify either URL or --export.\n"; + cerr << "Error: You have to specify either URI or --export.\n"; return options(1); } } diff --git a/src/parse_options.hpp b/src/parse_options.hpp index add8160..716d7fc 100644 --- a/src/parse_options.hpp +++ b/src/parse_options.hpp @@ -39,7 +39,7 @@ typedef struct options string file; array span = { time_point(), system_clock::now() }; sort_attribute sort = sort_attribute::both; - string url; + string uri; uint8_t status_code = 0; options(); diff --git a/src/url.cpp b/src/uri.cpp similarity index 95% rename from src/url.cpp rename to src/uri.cpp index 191c9c7..4d496d9 100644 --- a/src/url.cpp +++ b/src/uri.cpp @@ -26,7 +26,7 @@ #include #include #include -#include "url.hpp" +#include "uri.hpp" namespace curlopts = curlpp::options; using std::uint64_t; @@ -37,12 +37,12 @@ using std::regex_replace; using std::regex_search; using std::smatch; -URL::URL(const string &url) - :_url(url) +URI::URI(const string &uri) + :_uri(uri) { } -const html_extract URL::get() +const html_extract URI::get() { try { @@ -52,7 +52,7 @@ const html_extract URL::get() + global::version); request.setOpt({ "Connection: close" }); request.setOpt(true); - request.setOpt(_url); + request.setOpt(_uri); request.setOpt(&oss); request.perform(); @@ -80,21 +80,21 @@ const html_extract URL::get() return { "", "", "" }; } -const string URL::extract_title(const string &html) +const string URI::extract_title(const string &html) { smatch match; regex_search(html, match, regex("([^<]+)")); - return match[1].str(); + return remove_newlines(match[1].str()); } -const string URL::extract_description(const string &html) +const string URI::extract_description(const string &html) { smatch match; regex_search(html, match, regex("description\"[^>]+content=\"([^\"]+)")); - return match[1].str(); + return remove_newlines(match[1].str()); } -const string URL::strip_html(const string &html) +const string URI::strip_html(const string &html) { string out; out = regex_replace(html, regex("<script[^<]+"), ""); // Remove JavaScript. @@ -107,7 +107,7 @@ const string URL::strip_html(const string &html) return unescape_html(out); } -const string URL::unescape_html(const string &html) +const string URI::unescape_html(const string &html) { string buffer = html; string output; @@ -407,9 +407,9 @@ const string URL::unescape_html(const string &html) return output; } -const string URL::archive() +const string URI::archive() { - if (_url.substr(0, 4) != "http") + if (_uri.substr(0, 4) != "http") { return ""; } @@ -422,7 +422,7 @@ const string URL::archive() + global::version); request.setOpt<curlopts::HttpHeader>({ "Connection: close" }); request.setOpt<curlopts::FollowLocation>(true); - request.setOpt<curlopts::Url>("https://web.archive.org/save/" + _url); + request.setOpt<curlopts::Url>("https://web.archive.org/save/" + _uri); request.setOpt<curlopts::WriteStream>(&oss); request.setOpt<curlopts::NoBody>(true); // Make a HEAD request. request.setOpt<curlpp::options::Header>(true); // Save headers in oss. @@ -442,3 +442,8 @@ const string URL::archive() return ""; } + +const string URI::remove_newlines(const string &text) +{ + return regex_replace(text, regex("\n"), " "); +} diff --git a/src/url.hpp b/src/uri.hpp similarity index 78% rename from src/url.hpp rename to src/uri.hpp index 88c791b..d8be0eb 100644 --- a/src/url.hpp +++ b/src/uri.hpp @@ -14,8 +14,8 @@ * along with this program. If not, see <http://www.gnu.org/licenses/>. */ -#ifndef REMWHAREAD_URL_HPP -#define REMWHAREAD_URL_HPP +#ifndef REMWHAREAD_URI_HPP +#define REMWHAREAD_URI_HPP #include <string> @@ -28,23 +28,24 @@ typedef struct html_extract string fulltext; } html_extract; -class URL +class URI { public: - explicit URL(const string &url); + explicit URI(const string &uri); - //! Download URL and extract title, description and full text. + //! Download URI and extract title, description and full text. const html_extract get(); - //! Save URL in archive and return URL. + //! Save URI in archive and return URI. const string archive(); private: - string _url; + string _uri; const string extract_title(const string &html); const string extract_description(const string &html); const string strip_html(const string &html); const string unescape_html(const string &html); + const string remove_newlines(const string &text); }; -#endif // REMWHAREAD_URL_HPP +#endif // REMWHAREAD_URI_HPP diff --git a/tests/test_parse_options.cpp b/tests/test_parse_options.cpp index 7ebfd86..34451d8 100644 --- a/tests/test_parse_options.cpp +++ b/tests/test_parse_options.cpp @@ -27,7 +27,7 @@ SCENARIO ("The option parser works correctly") { bool exception = false; options opts; - const string url = "https://example.com/article.html"; + const string uri = "https://example.com/article.html"; WHEN ("The options are --help --file test") { @@ -75,12 +75,12 @@ SCENARIO ("The option parser works correctly") } } - WHEN ("The options are -t 💩 " + url) + WHEN ("The options are -t 💩 " + uri) { try { const char *argv[] - = { "remwharead", "-t", "💩", url.c_str() }; + = { "remwharead", "-t", "💩", uri.c_str() }; opts = parse_options(4, argv); } catch (const std::exception &e) @@ -95,7 +95,7 @@ SCENARIO ("The option parser works correctly") REQUIRE_FALSE(exception); REQUIRE(opts.status_code == 0); REQUIRE(opts.tags == vector<string>{ "💩" }); - REQUIRE(opts.url == url); + REQUIRE(opts.uri == uri); } } @@ -115,7 +115,7 @@ SCENARIO ("The option parser works correctly") "remwharead", "-t", tags.c_str(), - url.c_str() + uri.c_str() }; opts = parse_options(4, argv); } @@ -131,7 +131,7 @@ SCENARIO ("The option parser works correctly") REQUIRE_FALSE(exception); REQUIRE(opts.status_code == 0); REQUIRE(opts.tags == vector<string>{ "tag1", longstring, "tag3" }); - REQUIRE(opts.url == url); + REQUIRE(opts.uri == uri); } } }