Archive URLs.
This commit is contained in:
parent
1c39c8ac86
commit
cc912b8a36
|
@ -47,7 +47,7 @@ int main(const int argc, const char *argv[])
|
|||
{
|
||||
URL url(opts.url);
|
||||
html_extract page = url.get();
|
||||
db.store(opts.url, "archive", system_clock::now(), opts.tags,
|
||||
db.store(opts.url, url.archive(), system_clock::now(), opts.tags,
|
||||
page.title, page.description, page.fulltext);
|
||||
}
|
||||
|
||||
|
|
31
src/url.cpp
31
src/url.cpp
|
@ -406,3 +406,34 @@ const string URL::unescape_html(const string &html)
|
|||
|
||||
return output;
|
||||
}
|
||||
|
||||
//! Ask web.archive.org to save _url and return the URL of the saved copy.
//!
//! Sends a HEAD request to https://web.archive.org/save/<_url>, follows
//! redirects and looks for a Content-Location header in the response headers.
//! Returns an empty string if the request throws or no such header is found;
//! exceptions are reported on cerr and not propagated.
const string URL::archive()
{
    try
    {
        std::ostringstream oss;
        curlpp::Easy request;
        request.setOpt<curlopts::UserAgent>(string("remwharead/")
                                            + global::version);
        request.setOpt<curlopts::HttpHeader>({ "Connection: close" });
        request.setOpt<curlopts::FollowLocation>(true);
        request.setOpt<curlopts::Url>("https://web.archive.org/save/" + _url);
        request.setOpt<curlopts::WriteStream>(&oss);
        request.setOpt<curlopts::NoBody>(true); // Make a HEAD request.
        request.setOpt<curlpp::options::Header>(true); // Save headers in oss.
        request.perform();

        // HTTP field names are case-insensitive (and arrive lowercase over
        // HTTP/2), and whitespace after the colon is optional, so do not
        // insist on the exact spelling "Content-Location: ".
        const regex re_location("Content-Location:[ \t]*(.+)\r\n",
                                regex::icase);
        smatch match;
        const string answer = oss.str();
        if (regex_search(answer, match, re_location))
        {
            // The capture is never empty because the pattern uses `.+`.
            const string location = match[1].str();
            // Avoid "https://web.archive.org//web/…" when the header value
            // is already an absolute path.
            const string separator = (location.front() == '/') ? "" : "/";
            return "https://web.archive.org" + separator + location;
        }
    }
    catch (const std::exception &e)
    {
        cerr << "Error: " << e.what() << endl;
    }

    return "";
}
|
||||
|
|
|
@ -33,7 +33,10 @@ class URL
|
|||
public:
|
||||
explicit URL(const string &url);
|
||||
|
||||
//! Download URL and extract title, description and full text.
|
||||
const html_extract get();
|
||||
//! Save URL in the Internet Archive (web.archive.org) and return the archive URL, or an empty string on failure.
|
||||
const string archive();
|
||||
|
||||
private:
|
||||
string _url;
|
||||
|
|
Loading…
Reference in New Issue
Block a user