Archive URLs.

This commit is contained in:
tastytea 2019-05-15 22:24:11 +02:00
parent 1c39c8ac86
commit cc912b8a36
Signed by: tastytea
GPG Key ID: CFC39497F1B26E07
3 changed files with 35 additions and 1 deletion

View File

@@ -47,7 +47,7 @@ int main(const int argc, const char *argv[])
{
URL url(opts.url);
html_extract page = url.get();
db.store(opts.url, "archive", system_clock::now(), opts.tags,
db.store(opts.url, url.archive(), system_clock::now(), opts.tags,
page.title, page.description, page.fulltext);
}

View File

@@ -406,3 +406,34 @@ const string URL::unescape_html(const string &html)
return output;
}
//! Ask web.archive.org to save the stored URL and return the archive URL.
//!
//! Sends a HEAD request to `https://web.archive.org/save/<url>`, following
//! redirects, and looks for a `Content-Location` header in the response
//! headers.
//!
//! @return The archive URL built from the `Content-Location` header, or an
//!         empty string if the header is missing or the request threw. In the
//!         latter case the exception message is written to stderr.
const string URL::archive()
{
    try
    {
        std::ostringstream oss;
        curlpp::Easy request;
        request.setOpt<curlopts::UserAgent>(string("remwharead/")
                                            + global::version);
        request.setOpt<curlopts::HttpHeader>({ "Connection: close" });
        request.setOpt<curlopts::FollowLocation>(true);
        request.setOpt<curlopts::Url>("https://web.archive.org/save/" + _url);
        request.setOpt<curlopts::WriteStream>(&oss);
        request.setOpt<curlopts::NoBody>(true); // Make a HEAD request.
        request.setOpt<curlpp::options::Header>(true); // Save headers in oss.
        request.perform();

        const string answer = oss.str();
        smatch match;
        // HTTP field names are case-insensitive (and lower-case over HTTP/2),
        // so the header name must not be matched case-sensitively.
        const regex re_location("Content-Location: (.+)\r\n", regex::icase);
        if (regex_search(answer, match, re_location))
        {
            const string location = match[1].str();
            const string base = "https://web.archive.org";
            // Avoid "//" in the result when the location is an absolute path.
            if (location.front() == '/')
            {
                return base + location;
            }
            return base + "/" + location;
        }
    }
    catch (const std::exception &e)
    {
        cerr << "Error: " << e.what() << endl;
    }

    return "";
}

View File

@@ -33,7 +33,10 @@ class URL
public:
explicit URL(const string &url);
//! Download URL and extract title, description and full text.
const html_extract get();
//! Save URL in archive and return URL.
const string archive();
private:
string _url;