#include "helpers.hpp"

#include <boost/regex.hpp>

#include <codecvt>
#include <locale>
#include <map>
#include <stdexcept>
#include <string>
#include <string_view>

namespace epubgrep::helpers
{

// Returns true if `check` is a space, line feed, carriage return or
// horizontal tab; false for every other character.
bool is_whitespace(const char check)
{
    switch (check)
    {
    case ' ':
    case '\n':
    case '\r':
    case '\t':
        return true;
    default:
        return false;
    }
}

// Decodes percent-encoded octets ("%XX") in `url` and returns the result.
//
// Only a '%' followed by exactly two hexadecimal digits is decoded. A
// malformed escape (truncated at the end of the input, or followed by
// non-hex characters) is copied to the output unchanged instead of throwing.
std::string urldecode(const std::string_view url)
{ // RFC 3986, section 2.1.
    // Value of a single hexadecimal digit, or -1 if `digit` is not one.
    const auto hex_value = [](const char digit) -> int
    {
        if (digit >= '0' && digit <= '9')
        {
            return digit - '0';
        }
        if (digit >= 'a' && digit <= 'f')
        {
            return digit - 'a' + 10;
        }
        if (digit >= 'A' && digit <= 'F')
        {
            return digit - 'A' + 10;
        }
        return -1;
    };

    size_t pos{0};
    size_t lastpos{0};
    std::string decoded;
    decoded.reserve(url.size());
    while ((pos = url.find('%', pos)) != std::string_view::npos)
    {
        const int high{pos + 1 < url.size() ? hex_value(url[pos + 1]) : -1};
        const int low{pos + 2 < url.size() ? hex_value(url[pos + 2]) : -1};
        if (high < 0 || low < 0)
        {
            // Not a valid escape: leave the '%' in place. It is copied
            // together with the next literal chunk because lastpos stays put.
            ++pos;
            continue;
        }

        decoded += url.substr(lastpos, pos - lastpos);
        decoded += static_cast<char>(high * 16 + low);
        pos += 3;
        lastpos = pos;
    }
    decoded += url.substr(lastpos);

    return decoded;
}

std::string unescape_html(const std::string_view html)
{
    std::string output;
    output.reserve(html.size());

    // Source: https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_
    //         entity_references#Character_entity_references_in_HTML
    static const std::map<std::string_view, char32_t>
        names{{"exclamation", 0x0021}, {"quot", 0x0022},    {"percent", 0x0025},
              {"amp", 0x0026},         {"apos", 0x0027},    {"add", 0x002B},
              {"lt", 0x003C},          {"equal", 0x003D},   {"gt", 0x003E},
              {"nbsp", 0x00A0},        {"iexcl", 0x00A1},   {"cent", 0x00A2},
              {"pound", 0x00A3},       {"curren", 0x00A4},  {"yen", 0x00A5},
              {"brvbar", 0x00A6},      {"sect", 0x00A7},    {"uml", 0x00A8},
              {"copy", 0x00A9},        {"ordf", 0x00AA},    {"laquo", 0x00AB},
              {"not", 0x00AC},         {"shy", 0x00AD},     {"reg", 0x00AE},
              {"macr", 0x00AF},        {"deg", 0x00B0},     {"plusmn", 0x00B1},
              {"sup2", 0x00B2},        {"sup3", 0x00B3},    {"acute", 0x00B4},
              {"micro", 0x00B5},       {"para", 0x00B6},    {"middot", 0x00B7},
              {"cedil", 0x00B8},       {"sup1", 0x00B9},    {"ordm", 0x00BA},
              {"raquo", 0x00BB},       {"frac14", 0x00BC},  {"frac12", 0x00BD},
              {"frac34", 0x00BE},      {"iquest", 0x00BF},  {"Agrave", 0x00C0},
              {"Aacute", 0x00C1},      {"Acirc", 0x00C2},   {"Atilde", 0x00C3},
              {"Auml", 0x00C4},        {"Aring", 0x00C5},   {"AElig", 0x00C6},
              {"Ccedil", 0x00C7},      {"Egrave", 0x00C8},  {"Eacute", 0x00C9},
              {"Ecirc", 0x00CA},       {"Euml", 0x00CB},    {"Igrave", 0x00CC},
              {"Iacute", 0x00CD},      {"Icirc", 0x00CE},   {"Iuml", 0x00CF},
              {"ETH", 0x00D0},         {"Ntilde", 0x00D1},  {"Ograve", 0x00D2},
              {"Oacute", 0x00D3},      {"Ocirc", 0x00D4},   {"Otilde", 0x00D5},
              {"Ouml", 0x00D6},        {"times", 0x00D7},   {"Oslash", 0x00D8},
              {"Ugrave", 0x00D9},      {"Uacute", 0x00DA},  {"Ucirc", 0x00DB},
              {"Uuml", 0x00DC},        {"Yacute", 0x00DD},  {"THORN", 0x00DE},
              {"szlig", 0x00DF},       {"agrave", 0x00E0},  {"aacute", 0x00E1},
              {"acirc", 0x00E2},       {"atilde", 0x00E3},  {"auml", 0x00E4},
              {"aring", 0x00E5},       {"aelig", 0x00E6},   {"ccedil", 0x00E7},
              {"egrave", 0x00E8},      {"eacute", 0x00E9},  {"ecirc", 0x00EA},
              {"euml", 0x00EB},        {"igrave", 0x00EC},  {"iacute", 0x00ED},
              {"icirc", 0x00EE},       {"iuml", 0x00EF},    {"eth", 0x00F0},
              {"ntilde", 0x00F1},      {"ograve", 0x00F2},  {"oacute", 0x00F3},
              {"ocirc", 0x00F4},       {"otilde", 0x00F5},  {"ouml", 0x00F6},
              {"divide", 0x00F7},      {"oslash", 0x00F8},  {"ugrave", 0x00F9},
              {"uacute", 0x00FA},      {"ucirc", 0x00FB},   {"uuml", 0x00FC},
              {"yacute", 0x00FD},      {"thorn", 0x00FE},   {"yuml", 0x00FF},
              {"OElig", 0x0152},       {"oelig", 0x0153},   {"Scaron", 0x0160},
              {"scaron", 0x0161},      {"Yuml", 0x0178},    {"fnof", 0x0192},
              {"circ", 0x02C6},        {"tilde", 0x02DC},   {"Alpha", 0x0391},
              {"Beta", 0x0392},        {"Gamma", 0x0393},   {"Delta", 0x0394},
              {"Epsilon", 0x0395},     {"Zeta", 0x0396},    {"Eta", 0x0397},
              {"Theta", 0x0398},       {"Iota", 0x0399},    {"Kappa", 0x039A},
              {"Lambda", 0x039B},      {"Mu", 0x039C},      {"Nu", 0x039D},
              {"Xi", 0x039E},          {"Omicron", 0x039F}, {"Pi", 0x03A0},
              {"Rho", 0x03A1},         {"Sigma", 0x03A3},   {"Tau", 0x03A4},
              {"Upsilon", 0x03A5},     {"Phi", 0x03A6},     {"Chi", 0x03A7},
              {"Psi", 0x03A8},         {"Omega", 0x03A9},   {"alpha", 0x03B1},
              {"beta", 0x03B2},        {"gamma", 0x03B3},   {"delta", 0x03B4},
              {"epsilon", 0x03B5},     {"zeta", 0x03B6},    {"eta", 0x03B7},
              {"theta", 0x03B8},       {"iota", 0x03B9},    {"kappa", 0x03BA},
              {"lambda", 0x03BB},      {"mu", 0x03BC},      {"nu", 0x03BD},
              {"xi", 0x03BE},          {"omicron", 0x03BF}, {"pi", 0x03C0},
              {"rho", 0x03C1},         {"sigmaf", 0x03C2},  {"sigma", 0x03C3},
              {"tau", 0x03C4},         {"upsilon", 0x03C5}, {"phi", 0x03C6},
              {"chi", 0x03C7},         {"psi", 0x03C8},     {"omega", 0x03C9},
              {"thetasym", 0x03D1},    {"upsih", 0x03D2},   {"piv", 0x03D6},
              {"ensp", 0x2002},        {"emsp", 0x2003},    {"thinsp", 0x2009},
              {"zwnj", 0x200C},        {"zwj", 0x200D},     {"lrm", 0x200E},
              {"rlm", 0x200F},         {"ndash", 0x2013},   {"mdash", 0x2014},
              {"horbar", 0x2015},      {"lsquo", 0x2018},   {"rsquo", 0x2019},
              {"sbquo", 0x201A},       {"ldquo", 0x201C},   {"rdquo", 0x201D},
              {"bdquo", 0x201E},       {"dagger", 0x2020},  {"Dagger", 0x2021},
              {"bull", 0x2022},        {"hellip", 0x2026},  {"permil", 0x2030},
              {"prime", 0x2032},       {"Prime", 0x2033},   {"lsaquo", 0x2039},
              {"rsaquo", 0x203A},      {"oline", 0x203E},   {"frasl", 0x2044},
              {"euro", 0x20AC},        {"image", 0x2111},   {"weierp", 0x2118},
              {"real", 0x211C},        {"trade", 0x2122},   {"alefsym", 0x2135},
              {"larr", 0x2190},        {"uarr", 0x2191},    {"rarr", 0x2192},
              {"darr", 0x2193},        {"harr", 0x2194},    {"crarr", 0x21B5},
              {"lArr", 0x21D0},        {"uArr", 0x21D1},    {"rArr", 0x21D2},
              {"dArr", 0x21D3},        {"hArr", 0x21D4},    {"forall", 0x2200},
              {"part", 0x2202},        {"exist", 0x2203},   {"empty", 0x2205},
              {"nabla", 0x2207},       {"isin", 0x2208},    {"notin", 0x2209},
              {"ni", 0x220B},          {"prod", 0x220F},    {"sum", 0x2211},
              {"minus", 0x2212},       {"lowast", 0x2217},  {"radic", 0x221A},
              {"prop", 0x221D},        {"infin", 0x221E},   {"ang", 0x2220},
              {"and", 0x2227},         {"or", 0x2228},      {"cap", 0x2229},
              {"cup", 0x222A},         {"int", 0x222B},     {"there4", 0x2234},
              {"sim", 0x223C},         {"cong", 0x2245},    {"asymp", 0x2248},
              {"ne", 0x2260},          {"equiv", 0x2261},   {"le", 0x2264},
              {"ge", 0x2265},          {"sub", 0x2282},     {"sup", 0x2283},
              {"nsub", 0x2284},        {"sube", 0x2286},    {"supe", 0x2287},
              {"oplus", 0x2295},       {"otimes", 0x2297},  {"perp", 0x22A5},
              {"sdot", 0x22C5},        {"lceil", 0x2308},   {"rceil", 0x2309},
              {"lfloor", 0x230A},      {"rfloor", 0x230B},  {"lang", 0x2329},
              {"rang", 0x232A},        {"loz", 0x25CA},     {"spades", 0x2660},
              {"clubs", 0x2663},       {"hearts", 0x2665},  {"diams", 0x2666}};

    // Used to convert number to utf-8 char.
    std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> u8c;
    // Matches numbered entities between 1 and 8 digits, decimal or hexadecimal,
    // or named entities.
    static const boost::regex re_entity{"&(#(x)?([[:alnum:]]{1,8})"
                                        "|[^;[:space:][:punct:]]+);"};
    std::string::const_iterator begin{html.begin()};
    std::string::const_iterator end{html.end()};
    boost::match_results<std::string::const_iterator> match;

    // Used for appending the rest of the text, after the last replacement.
    std::string::const_iterator end_last_match{begin};

    while (boost::regex_search(begin, end, match, re_entity))
    {
        output += match.prefix();
        try
        {
            const char32_t codepoint{
                [&match]
                {
                    // If it doesn't start with a '#' it is a named entity.
                    if (match[1].str()[0] != '#')
                    {
                        return names.at(match[1].str());
                    }
                    // 'x' after '#' means the number is hexadecimal.
                    if (match[2].length() == 1)
                    {
                        return static_cast<char32_t>(
                            std::stoul(match[3].str(), nullptr, 16));
                    }
                    // '#' without 'x' means the number is decimal.
                    return static_cast<char32_t>(
                        std::stoul(match[3].str(), nullptr, 10));
                }()};
            output += u8c.to_bytes(codepoint);
        }
        catch (const std::out_of_range &) // Named entity could not be found.
        {
            output += match.str();
        }
        begin = match[0].end();
        end_last_match = begin;
    }

    output += std::string(end_last_match, end);

    return output;
}

// Returns the value of the environment variable `name`, or an empty view if
// it is not set.
//
// The returned view refers to the string handed out by std::getenv and is
// not owned by the caller; copy it if it has to outlive later changes to the
// environment.
std::string_view get_env(const std::string_view name)
{
    // A string_view is not required to be NUL-terminated (e.g. a sub-view of
    // a longer string), but std::getenv needs a C string. Copy the name so
    // that exactly the viewed characters are looked up.
    const std::string terminated_name{name};
    const char *env = std::getenv(terminated_name.c_str());
    if (env != nullptr)
    {
        return env;
    }

    return {};
}

} // namespace epubgrep::helpers