Renamed URL to URI.

This commit is contained in:
tastytea 2019-05-16 08:36:35 +02:00
parent 98743148f9
commit 914d22569f
Signed by: tastytea
GPG Key ID: CFC39497F1B26E07
7 changed files with 48 additions and 42 deletions

View File

@ -13,20 +13,20 @@ remwharead - Remember what you read, and when
== SYNOPSIS == SYNOPSIS
*remwharead* [*-t* _tags_] _URL_ *remwharead* [*-t* _tags_] _URI_
*remwharead* *-e* _format_ [*-f* _file_] [*-s* _start_,_end_] *remwharead* *-e* _format_ [*-f* _file_] [*-s* _start_,_end_]
== DESCRIPTION == DESCRIPTION
*remwharead* saves URLs of things you read in a database along with a URL to *remwharead* saves URIs of things you read in a database along with a URI to
the archived version, the current date and time, title, description, the full the archived version, the current date and time, title, description, the full
text of the page and optional tags. text of the page and optional tags.
== OPTIONS == OPTIONS
*-t* _tags_, *--tags* _tags_:: *-t* _tags_, *--tags* _tags_::
Add tags to _URL_, delimited by commas. Add tags to _URI_, delimited by commas.
*-e* _format_, *--export* _format_:: *-e* _format_, *--export* _format_::
Export to _format_. Possible values are _csv_ and _asciidoc_. See _FORMATS_. Export to _format_. Possible values are _csv_ and _asciidoc_. See _FORMATS_.

View File

@ -19,8 +19,8 @@
#include <chrono> #include <chrono>
#include "sqlite.hpp" #include "sqlite.hpp"
#include "parse_options.hpp" #include "parse_options.hpp"
#include "url.hpp"
#include "csv.hpp" #include "csv.hpp"
#include "uri.hpp"
using std::cout; using std::cout;
using std::cerr; using std::cerr;
@ -44,11 +44,11 @@ int main(const int argc, const char *argv[])
return 2; return 2;
} }
if (!opts.url.empty()) if (!opts.uri.empty())
{ {
URL url(opts.url); URI uri(opts.uri);
html_extract page = url.get(); html_extract page = uri.get();
db.store({opts.url, url.archive(), system_clock::now(), opts.tags, db.store({opts.uri, uri.archive(), system_clock::now(), opts.tags,
page.title, page.description, page.fulltext}); page.title, page.description, page.fulltext});
} }

View File

@ -44,7 +44,7 @@ const options parse_options(const int argc, const char *argv[])
{ {
popl::OptionParser op("Available options"); popl::OptionParser op("Available options");
op.add<popl::Value<string>> op.add<popl::Value<string>>
("t", "tags", "Add tags to URL, delimited by commas.", "", &tags); ("t", "tags", "Add tags to URI, delimited by commas.", "", &tags);
op.add<popl::Value<string>> op.add<popl::Value<string>>
("e", "export", "Export to format.", "", &format); ("e", "export", "Export to format.", "", &format);
op.add<popl::Value<string>> op.add<popl::Value<string>>
@ -62,7 +62,7 @@ const options parse_options(const int argc, const char *argv[])
if (option_help->is_set()) if (option_help->is_set())
{ {
cout << "Usage: " << argv[0] << " [-t tags] URL\n" cout << "Usage: " << argv[0] << " [-t tags] URI\n"
<< " " << argv[0] << " " << argv[0]
<< " -e format [-f file] [-s start,end]\n"; << " -e format [-f file] [-s start,end]\n";
cout << op; cout << op;
@ -155,12 +155,12 @@ const options parse_options(const int argc, const char *argv[])
if (op.non_option_args().size() > 0) if (op.non_option_args().size() > 0)
{ {
opts.url = op.non_option_args().front(); opts.uri = op.non_option_args().front();
} }
if (opts.url == "" && opts.format == export_format::undefined) if (opts.uri == "" && opts.format == export_format::undefined)
{ {
cerr << "Error: You have to specify either URL or --export.\n"; cerr << "Error: You have to specify either URI or --export.\n";
return options(1); return options(1);
} }
} }

View File

@ -39,7 +39,7 @@ typedef struct options
string file; string file;
array<time_point, 2> span = { time_point(), system_clock::now() }; array<time_point, 2> span = { time_point(), system_clock::now() };
sort_attribute sort = sort_attribute::both; sort_attribute sort = sort_attribute::both;
string url; string uri;
uint8_t status_code = 0; uint8_t status_code = 0;
options(); options();

View File

@ -26,7 +26,7 @@
#include <curlpp/Exception.hpp> #include <curlpp/Exception.hpp>
#include <curlpp/Infos.hpp> #include <curlpp/Infos.hpp>
#include <version.hpp> #include <version.hpp>
#include "url.hpp" #include "uri.hpp"
namespace curlopts = curlpp::options; namespace curlopts = curlpp::options;
using std::uint64_t; using std::uint64_t;
@ -37,12 +37,12 @@ using std::regex_replace;
using std::regex_search; using std::regex_search;
using std::smatch; using std::smatch;
URL::URL(const string &url) URI::URI(const string &uri)
:_url(url) :_uri(uri)
{ {
} }
const html_extract URL::get() const html_extract URI::get()
{ {
try try
{ {
@ -52,7 +52,7 @@ const html_extract URL::get()
+ global::version); + global::version);
request.setOpt<curlopts::HttpHeader>({ "Connection: close" }); request.setOpt<curlopts::HttpHeader>({ "Connection: close" });
request.setOpt<curlopts::FollowLocation>(true); request.setOpt<curlopts::FollowLocation>(true);
request.setOpt<curlopts::Url>(_url); request.setOpt<curlopts::Url>(_uri);
request.setOpt<curlopts::WriteStream>(&oss); request.setOpt<curlopts::WriteStream>(&oss);
request.perform(); request.perform();
@ -80,21 +80,21 @@ const html_extract URL::get()
return { "", "", "" }; return { "", "", "" };
} }
const string URL::extract_title(const string &html) const string URI::extract_title(const string &html)
{ {
smatch match; smatch match;
regex_search(html, match, regex("<title>([^<]+)")); regex_search(html, match, regex("<title>([^<]+)"));
return match[1].str(); return remove_newlines(match[1].str());
} }
const string URL::extract_description(const string &html) const string URI::extract_description(const string &html)
{ {
smatch match; smatch match;
regex_search(html, match, regex("description\"[^>]+content=\"([^\"]+)")); regex_search(html, match, regex("description\"[^>]+content=\"([^\"]+)"));
return match[1].str(); return remove_newlines(match[1].str());
} }
const string URL::strip_html(const string &html) const string URI::strip_html(const string &html)
{ {
string out; string out;
out = regex_replace(html, regex("<script[^<]+"), ""); // Remove JavaScript. out = regex_replace(html, regex("<script[^<]+"), ""); // Remove JavaScript.
@ -107,7 +107,7 @@ const string URL::strip_html(const string &html)
return unescape_html(out); return unescape_html(out);
} }
const string URL::unescape_html(const string &html) const string URI::unescape_html(const string &html)
{ {
string buffer = html; string buffer = html;
string output; string output;
@ -407,9 +407,9 @@ const string URL::unescape_html(const string &html)
return output; return output;
} }
const string URL::archive() const string URI::archive()
{ {
if (_url.substr(0, 4) != "http") if (_uri.substr(0, 4) != "http")
{ {
return ""; return "";
} }
@ -422,7 +422,7 @@ const string URL::archive()
+ global::version); + global::version);
request.setOpt<curlopts::HttpHeader>({ "Connection: close" }); request.setOpt<curlopts::HttpHeader>({ "Connection: close" });
request.setOpt<curlopts::FollowLocation>(true); request.setOpt<curlopts::FollowLocation>(true);
request.setOpt<curlopts::Url>("https://web.archive.org/save/" + _url); request.setOpt<curlopts::Url>("https://web.archive.org/save/" + _uri);
request.setOpt<curlopts::WriteStream>(&oss); request.setOpt<curlopts::WriteStream>(&oss);
request.setOpt<curlopts::NoBody>(true); // Make a HEAD request. request.setOpt<curlopts::NoBody>(true); // Make a HEAD request.
request.setOpt<curlpp::options::Header>(true); // Save headers in oss. request.setOpt<curlpp::options::Header>(true); // Save headers in oss.
@ -442,3 +442,8 @@ const string URL::archive()
return ""; return "";
} }
const string URI::remove_newlines(const string &text)
{
return regex_replace(text, regex("\n"), " ");
}

View File

@ -14,8 +14,8 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>. * along with this program. If not, see <http://www.gnu.org/licenses/>.
*/ */
#ifndef REMWHAREAD_URL_HPP #ifndef REMWHAREAD_URI_HPP
#define REMWHAREAD_URL_HPP #define REMWHAREAD_URI_HPP
#include <string> #include <string>
@ -28,23 +28,24 @@ typedef struct html_extract
string fulltext; string fulltext;
} html_extract; } html_extract;
class URL class URI
{ {
public: public:
explicit URL(const string &url); explicit URI(const string &uri);
//! Download URL and extract title, description and full text. //! Download URI and extract title, description and full text.
const html_extract get(); const html_extract get();
//! Save URL in archive and return the URL of the archived version. //! Save URI in archive and return the URI of the archived version.
const string archive(); const string archive();
private: private:
string _url; string _uri;
const string extract_title(const string &html); const string extract_title(const string &html);
const string extract_description(const string &html); const string extract_description(const string &html);
const string strip_html(const string &html); const string strip_html(const string &html);
const string unescape_html(const string &html); const string unescape_html(const string &html);
const string remove_newlines(const string &text);
}; };
#endif // REMWHAREAD_URL_HPP #endif // REMWHAREAD_URI_HPP

View File

@ -27,7 +27,7 @@ SCENARIO ("The option parser works correctly")
{ {
bool exception = false; bool exception = false;
options opts; options opts;
const string url = "https://example.com/article.html"; const string uri = "https://example.com/article.html";
WHEN ("The options are --help --file test") WHEN ("The options are --help --file test")
{ {
@ -75,12 +75,12 @@ SCENARIO ("The option parser works correctly")
} }
} }
WHEN ("The options are -t 💩 " + url) WHEN ("The options are -t 💩 " + uri)
{ {
try try
{ {
const char *argv[] const char *argv[]
= { "remwharead", "-t", "💩", url.c_str() }; = { "remwharead", "-t", "💩", uri.c_str() };
opts = parse_options(4, argv); opts = parse_options(4, argv);
} }
catch (const std::exception &e) catch (const std::exception &e)
@ -95,7 +95,7 @@ SCENARIO ("The option parser works correctly")
REQUIRE_FALSE(exception); REQUIRE_FALSE(exception);
REQUIRE(opts.status_code == 0); REQUIRE(opts.status_code == 0);
REQUIRE(opts.tags == vector<string>{ "💩" }); REQUIRE(opts.tags == vector<string>{ "💩" });
REQUIRE(opts.url == url); REQUIRE(opts.uri == uri);
} }
} }
@ -115,7 +115,7 @@ SCENARIO ("The option parser works correctly")
"remwharead", "remwharead",
"-t", "-t",
tags.c_str(), tags.c_str(),
url.c_str() uri.c_str()
}; };
opts = parse_options(4, argv); opts = parse_options(4, argv);
} }
@ -131,7 +131,7 @@ SCENARIO ("The option parser works correctly")
REQUIRE_FALSE(exception); REQUIRE_FALSE(exception);
REQUIRE(opts.status_code == 0); REQUIRE(opts.status_code == 0);
REQUIRE(opts.tags == vector<string>{ "tag1", longstring, "tag3" }); REQUIRE(opts.tags == vector<string>{ "tag1", longstring, "tag3" });
REQUIRE(opts.url == url); REQUIRE(opts.uri == uri);
} }
} }
} }