diff --git a/src/main.cpp b/src/main.cpp
index 07cf1c9..87cad63 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -47,7 +47,7 @@ int main(const int argc, const char *argv[])
     {
         URL url(opts.url);
         html_extract page = url.get();
-        db.store(opts.url, "archive", system_clock::now(), opts.tags,
+        db.store(opts.url, url.archive(), system_clock::now(), opts.tags,
                  page.title, page.description, page.fulltext);
     }
 
diff --git a/src/url.cpp b/src/url.cpp
index e7ce9f7..a281822 100644
--- a/src/url.cpp
+++ b/src/url.cpp
@@ -406,3 +406,43 @@ const string URL::unescape_html(const string &html)
 
     return output;
 }
+
+// Ask web.archive.org to save _url and return the address it reports for the
+// snapshot. Returns an empty string if the request throws or the response
+// carries no Content-Location header.
+const string URL::archive()
+{
+    try
+    {
+        std::ostringstream oss;
+        curlpp::Easy request;
+        request.setOpt<curlpp::options::UserAgent>(string("remwharead/")
+                                                   + global::version);
+        request.setOpt<curlpp::options::HttpHeader>({ "Connection: close" });
+        request.setOpt<curlpp::options::FollowLocation>(true);
+        request.setOpt<curlpp::options::Url>("https://web.archive.org/save/"
+                                             + _url);
+        request.setOpt<curlpp::options::WriteStream>(&oss);
+        // Make a HEAD request.
+        request.setOpt<curlpp::options::NoBody>(true);
+        // Save headers in oss.
+        request.setOpt<curlpp::options::Header>(true);
+        request.perform();
+
+        // Header field names are case-insensitive and HTTP/2 sends them in
+        // lower case, so do not rely on the exact spelling.
+        smatch match;
+        const string answer = oss.str();
+        if (regex_search(answer, match,
+                         regex("Content-Location: (.+)\r\n", regex::icase)))
+        {
+            return "https://web.archive.org/" + match[1].str();
+        }
+    }
+    catch (const std::exception &e)
+    {
+        cerr << "Error: " << e.what() << endl;
+    }
+
+    return "";
+}
diff --git a/src/url.hpp b/src/url.hpp
index fe8641e..88c791b 100644
--- a/src/url.hpp
+++ b/src/url.hpp
@@ -33,7 +33,11 @@ class URL
 public:
     explicit URL(const string &url);
 
+    //! Download URL and extract title, description and full text.
     const html_extract get();
+    //! Save URL at web.archive.org and return the URL of the snapshot, or an
+    //! empty string on failure.
+    const string archive();
 
 private:
     string _url;