// Copyright Toru Niina 2017. // Distributed under the MIT License. #ifndef TOML11_PARSER_HPP #define TOML11_PARSER_HPP #include #include #include #include "combinator.hpp" #include "lexer.hpp" #include "region.hpp" #include "result.hpp" #include "types.hpp" #include "value.hpp" #ifndef TOML11_DISABLE_STD_FILESYSTEM #ifdef __cpp_lib_filesystem #if __has_include() #define TOML11_HAS_STD_FILESYSTEM #include #endif // has_include() #endif // __cpp_lib_filesystem #endif // TOML11_DISABLE_STD_FILESYSTEM namespace toml { namespace detail { inline result, std::string> parse_boolean(location& loc) { const auto first = loc.iter(); if(const auto token = lex_boolean::invoke(loc)) { const auto reg = token.unwrap(); if (reg.str() == "true") {return ok(std::make_pair(true, reg));} else if(reg.str() == "false") {return ok(std::make_pair(false, reg));} else // internal error. { throw internal_error(format_underline( "toml::parse_boolean: internal error", {{source_location(reg), "invalid token"}}), source_location(reg)); } } loc.reset(first); //rollback return err(format_underline("toml::parse_boolean: ", {{source_location(loc), "the next token is not a boolean"}})); } inline result, std::string> parse_binary_integer(location& loc) { const auto first = loc.iter(); if(const auto token = lex_bin_int::invoke(loc)) { auto str = token.unwrap().str(); assert(str.size() > 2); // minimum -> 0b1 integer retval(0), base(1); for(auto i(str.rbegin()), e(str.rend() - 2); i!=e; ++i) { if (*i == '1'){retval += base; base *= 2;} else if(*i == '0'){base *= 2;} else if(*i == '_'){/* do nothing. */} else // internal error. { throw internal_error(format_underline( "toml::parse_integer: internal error", {{source_location(token.unwrap()), "invalid token"}}), source_location(loc)); } } return ok(std::make_pair(retval, token.unwrap())); } loc.reset(first); return err(format_underline("toml::parse_binary_integer:", {{source_location(loc), "the next token is not an integer"}})); } inline result, std::string> parse_octal_integer(location& loc) { const auto first = loc.iter(); if(const auto token = lex_oct_int::invoke(loc)) { auto str = token.unwrap().str(); str.erase(std::remove(str.begin(), str.end(), '_'), str.end()); str.erase(str.begin()); str.erase(str.begin()); // remove `0o` prefix std::istringstream iss(str); integer retval(0); iss >> std::oct >> retval; return ok(std::make_pair(retval, token.unwrap())); } loc.reset(first); return err(format_underline("toml::parse_octal_integer:", {{source_location(loc), "the next token is not an integer"}})); } inline result, std::string> parse_hexadecimal_integer(location& loc) { const auto first = loc.iter(); if(const auto token = lex_hex_int::invoke(loc)) { auto str = token.unwrap().str(); str.erase(std::remove(str.begin(), str.end(), '_'), str.end()); str.erase(str.begin()); str.erase(str.begin()); // remove `0x` prefix std::istringstream iss(str); integer retval(0); iss >> std::hex >> retval; return ok(std::make_pair(retval, token.unwrap())); } loc.reset(first); return err(format_underline("toml::parse_hexadecimal_integer", {{source_location(loc), "the next token is not an integer"}})); } inline result, std::string> parse_integer(location& loc) { const auto first = loc.iter(); if(first != loc.end() && *first == '0') { const auto second = std::next(first); if(second == loc.end()) // the token is just zero. { loc.advance(); return ok(std::make_pair(0, region(loc, first, second))); } if(*second == 'b') {return parse_binary_integer (loc);} // 0b1100 if(*second == 'o') {return parse_octal_integer (loc);} // 0o775 if(*second == 'x') {return parse_hexadecimal_integer(loc);} // 0xC0FFEE if(std::isdigit(*second)) { return err(format_underline("toml::parse_integer: " "leading zero in an Integer is not allowed.", {{source_location(loc), "leading zero"}})); } else if(std::isalpha(*second)) { return err(format_underline("toml::parse_integer: " "unknown integer prefix appeared.", {{source_location(loc), "none of 0x, 0o, 0b"}})); } } if(const auto token = lex_dec_int::invoke(loc)) { auto str = token.unwrap().str(); str.erase(std::remove(str.begin(), str.end(), '_'), str.end()); std::istringstream iss(str); integer retval(0); iss >> retval; return ok(std::make_pair(retval, token.unwrap())); } loc.reset(first); return err(format_underline("toml::parse_integer: ", {{source_location(loc), "the next token is not an integer"}})); } inline result, std::string> parse_floating(location& loc) { const auto first = loc.iter(); if(const auto token = lex_float::invoke(loc)) { auto str = token.unwrap().str(); if(str == "inf" || str == "+inf") { if(std::numeric_limits::has_infinity) { return ok(std::make_pair( std::numeric_limits::infinity(), token.unwrap())); } else { throw std::domain_error("toml::parse_floating: inf value found" " but the current environment does not support inf. Please" " make sure that the floating-point implementation conforms" " IEEE 754/ISO 60559 international standard."); } } else if(str == "-inf") { if(std::numeric_limits::has_infinity) { return ok(std::make_pair( -std::numeric_limits::infinity(), token.unwrap())); } else { throw std::domain_error("toml::parse_floating: inf value found" " but the current environment does not support inf. Please" " make sure that the floating-point implementation conforms" " IEEE 754/ISO 60559 international standard."); } } else if(str == "nan" || str == "+nan") { if(std::numeric_limits::has_quiet_NaN) { return ok(std::make_pair( std::numeric_limits::quiet_NaN(), token.unwrap())); } else if(std::numeric_limits::has_signaling_NaN) { return ok(std::make_pair( std::numeric_limits::signaling_NaN(), token.unwrap())); } else { throw std::domain_error("toml::parse_floating: NaN value found" " but the current environment does not support NaN. Please" " make sure that the floating-point implementation conforms" " IEEE 754/ISO 60559 international standard."); } } else if(str == "-nan") { if(std::numeric_limits::has_quiet_NaN) { return ok(std::make_pair( -std::numeric_limits::quiet_NaN(), token.unwrap())); } else if(std::numeric_limits::has_signaling_NaN) { return ok(std::make_pair( -std::numeric_limits::signaling_NaN(), token.unwrap())); } else { throw std::domain_error("toml::parse_floating: NaN value found" " but the current environment does not support NaN. Please" " make sure that the floating-point implementation conforms" " IEEE 754/ISO 60559 international standard."); } } str.erase(std::remove(str.begin(), str.end(), '_'), str.end()); std::istringstream iss(str); floating v(0.0); iss >> v; return ok(std::make_pair(v, token.unwrap())); } loc.reset(first); return err(format_underline("toml::parse_floating: ", {{source_location(loc), "the next token is not a float"}})); } inline std::string read_utf8_codepoint(const region& reg, const location& loc) { const auto str = reg.str().substr(1); std::uint_least32_t codepoint; std::istringstream iss(str); iss >> std::hex >> codepoint; const auto to_char = [](const std::uint_least32_t i) noexcept -> char { const auto uc = static_cast(i); return *reinterpret_cast(std::addressof(uc)); }; std::string character; if(codepoint < 0x80) // U+0000 ... U+0079 ; just an ASCII. { character += static_cast(codepoint); } else if(codepoint < 0x800) //U+0080 ... U+07FF { // 110yyyyx 10xxxxxx; 0x3f == 0b0011'1111 character += to_char(0xC0| codepoint >> 6); character += to_char(0x80|(codepoint & 0x3F)); } else if(codepoint < 0x10000) // U+0800...U+FFFF { if(0xD800 <= codepoint && codepoint <= 0xDFFF) { throw syntax_error(format_underline( "toml::read_utf8_codepoint: codepoints in the range " "[0xD800, 0xDFFF] are not valid UTF-8.", {{ source_location(loc), "not a valid UTF-8 codepoint" }}), source_location(loc)); } assert(codepoint < 0xD800 || 0xDFFF < codepoint); // 1110yyyy 10yxxxxx 10xxxxxx character += to_char(0xE0| codepoint >> 12); character += to_char(0x80|(codepoint >> 6 & 0x3F)); character += to_char(0x80|(codepoint & 0x3F)); } else if(codepoint < 0x110000) // U+010000 ... U+10FFFF { // 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx character += to_char(0xF0| codepoint >> 18); character += to_char(0x80|(codepoint >> 12 & 0x3F)); character += to_char(0x80|(codepoint >> 6 & 0x3F)); character += to_char(0x80|(codepoint & 0x3F)); } else // out of UTF-8 region { throw syntax_error(format_underline("toml::read_utf8_codepoint:" " input codepoint is too large.", {{source_location(loc), "should be in [0x00..0x10FFFF]"}}), source_location(loc)); } return character; } inline result parse_escape_sequence(location& loc) { const auto first = loc.iter(); if(first == loc.end() || *first != '\\') { return err(format_underline("toml::parse_escape_sequence: ", {{ source_location(loc), "the next token is not a backslash \"\\\""}})); } loc.advance(); switch(*loc.iter()) { case '\\':{loc.advance(); return ok(std::string("\\"));} case '"' :{loc.advance(); return ok(std::string("\""));} case 'b' :{loc.advance(); return ok(std::string("\b"));} case 't' :{loc.advance(); return ok(std::string("\t"));} case 'n' :{loc.advance(); return ok(std::string("\n"));} case 'f' :{loc.advance(); return ok(std::string("\f"));} case 'r' :{loc.advance(); return ok(std::string("\r"));} case 'u' : { if(const auto token = lex_escape_unicode_short::invoke(loc)) { return ok(read_utf8_codepoint(token.unwrap(), loc)); } else { return err(format_underline("parse_escape_sequence: " "invalid token found in UTF-8 codepoint uXXXX.", {{source_location(loc), "here"}})); } } case 'U': { if(const auto token = lex_escape_unicode_long::invoke(loc)) { return ok(read_utf8_codepoint(token.unwrap(), loc)); } else { return err(format_underline("parse_escape_sequence: " "invalid token found in UTF-8 codepoint Uxxxxxxxx", {{source_location(loc), "here"}})); } } } const auto msg = format_underline("parse_escape_sequence: " "unknown escape sequence appeared.", {{source_location(loc), "escape sequence is one of \\, \", b, t, n, f, r, uxxxx, Uxxxxxxxx"}}, /* Hints = */{"if you want to write backslash as just one backslash, " "use literal string like: regex = '<\\i\\c*\\s*>'"}); loc.reset(first); return err(msg); } inline std::ptrdiff_t check_utf8_validity(const std::string& reg) { location loc("tmp", reg); const auto u8 = repeat::invoke(loc); if(!u8 || loc.iter() != loc.end()) { const auto error_location = std::distance(loc.begin(), loc.iter()); assert(0 <= error_location); return error_location; } return -1; } inline result, std::string> parse_ml_basic_string(location& loc) { const auto first = loc.iter(); if(const auto token = lex_ml_basic_string::invoke(loc)) { auto inner_loc = loc; inner_loc.reset(first); std::string retval; retval.reserve(token.unwrap().size()); auto delim = lex_ml_basic_string_open::invoke(inner_loc); if(!delim) { throw internal_error(format_underline( "parse_ml_basic_string: invalid token", {{source_location(inner_loc), "should be \"\"\""}}), source_location(inner_loc)); } // immediate newline is ignored (if exists) /* discard return value */ lex_newline::invoke(inner_loc); delim = none(); while(!delim) { using lex_unescaped_seq = repeat< either, unlimited>; if(auto unescaped = lex_unescaped_seq::invoke(inner_loc)) { retval += unescaped.unwrap().str(); } if(auto escaped = parse_escape_sequence(inner_loc)) { retval += escaped.unwrap(); } if(auto esc_nl = lex_ml_basic_escaped_newline::invoke(inner_loc)) { // ignore newline after escape until next non-ws char } if(inner_loc.iter() == inner_loc.end()) { throw internal_error(format_underline( "parse_ml_basic_string: unexpected end of region", {{source_location(inner_loc), "not sufficient token"}}), source_location(inner_loc)); } delim = lex_ml_basic_string_close::invoke(inner_loc); } // `lex_ml_basic_string_close` allows 3 to 5 `"`s to allow 1 or 2 `"`s // at just before the delimiter. Here, we need to attach `"`s at the // end of the string body, if it exists. // For detail, see the definition of `lex_ml_basic_string_close`. assert(std::all_of(delim.unwrap().first(), delim.unwrap().last(), [](const char c) noexcept {return c == '\"';})); switch(delim.unwrap().size()) { case 3: {break;} case 4: {retval += "\""; break;} case 5: {retval += "\"\""; break;} default: { throw internal_error(format_underline( "parse_ml_basic_string: closing delimiter has invalid length", {{source_location(inner_loc), "end of this"}}), source_location(inner_loc)); } } const auto err_loc = check_utf8_validity(token.unwrap().str()); if(err_loc == -1) { return ok(std::make_pair(toml::string(retval), token.unwrap())); } else { inner_loc.reset(first); inner_loc.advance(err_loc); throw syntax_error(format_underline( "parse_ml_basic_string: invalid utf8 sequence found", {{source_location(inner_loc), "here"}}), source_location(inner_loc)); } } else { loc.reset(first); return err(format_underline("toml::parse_ml_basic_string: " "the next token is not a valid multiline string", {{source_location(loc), "here"}})); } } inline result, std::string> parse_basic_string(location& loc) { const auto first = loc.iter(); if(const auto token = lex_basic_string::invoke(loc)) { auto inner_loc = loc; inner_loc.reset(first); auto quot = lex_quotation_mark::invoke(inner_loc); if(!quot) { throw internal_error(format_underline("parse_basic_string: " "invalid token", {{source_location(inner_loc), "should be \""}}), source_location(inner_loc)); } std::string retval; retval.reserve(token.unwrap().size()); quot = none(); while(!quot) { using lex_unescaped_seq = repeat; if(auto unescaped = lex_unescaped_seq::invoke(inner_loc)) { retval += unescaped.unwrap().str(); } if(auto escaped = parse_escape_sequence(inner_loc)) { retval += escaped.unwrap(); } if(inner_loc.iter() == inner_loc.end()) { throw internal_error(format_underline( "parse_basic_string: unexpected end of region", {{source_location(inner_loc), "not sufficient token"}}), source_location(inner_loc)); } quot = lex_quotation_mark::invoke(inner_loc); } const auto err_loc = check_utf8_validity(token.unwrap().str()); if(err_loc == -1) { return ok(std::make_pair(toml::string(retval), token.unwrap())); } else { inner_loc.reset(first); inner_loc.advance(err_loc); throw syntax_error(format_underline( "parse_ml_basic_string: invalid utf8 sequence found", {{source_location(inner_loc), "here"}}), source_location(inner_loc)); } } else { loc.reset(first); // rollback return err(format_underline("toml::parse_basic_string: " "the next token is not a valid string", {{source_location(loc), "here"}})); } } inline result, std::string> parse_ml_literal_string(location& loc) { const auto first = loc.iter(); if(const auto token = lex_ml_literal_string::invoke(loc)) { location inner_loc(loc.name(), token.unwrap().str()); const auto open = lex_ml_literal_string_open::invoke(inner_loc); if(!open) { throw internal_error(format_underline( "parse_ml_literal_string: invalid token", {{source_location(inner_loc), "should be '''"}}), source_location(inner_loc)); } // immediate newline is ignored (if exists) /* discard return value */ lex_newline::invoke(inner_loc); const auto body = lex_ml_literal_body::invoke(inner_loc); const auto close = lex_ml_literal_string_close::invoke(inner_loc); if(!close) { throw internal_error(format_underline( "parse_ml_literal_string: invalid token", {{source_location(inner_loc), "should be '''"}}), source_location(inner_loc)); } // `lex_ml_literal_string_close` allows 3 to 5 `'`s to allow 1 or 2 `'`s // at just before the delimiter. Here, we need to attach `'`s at the // end of the string body, if it exists. // For detail, see the definition of `lex_ml_basic_string_close`. std::string retval = body.unwrap().str(); assert(std::all_of(close.unwrap().first(), close.unwrap().last(), [](const char c) noexcept {return c == '\'';})); switch(close.unwrap().size()) { case 3: {break;} case 4: {retval += "'"; break;} case 5: {retval += "''"; break;} default: { throw internal_error(format_underline( "parse_ml_literal_string: closing delimiter has invalid length", {{source_location(inner_loc), "end of this"}}), source_location(inner_loc)); } } const auto err_loc = check_utf8_validity(token.unwrap().str()); if(err_loc == -1) { return ok(std::make_pair(toml::string(retval, toml::string_t::literal), token.unwrap())); } else { inner_loc.reset(first); inner_loc.advance(err_loc); throw syntax_error(format_underline( "parse_ml_basic_string: invalid utf8 sequence found", {{source_location(inner_loc), "here"}}), source_location(inner_loc)); } } else { loc.reset(first); // rollback return err(format_underline("toml::parse_ml_literal_string: " "the next token is not a valid multiline literal string", {{source_location(loc), "here"}})); } } inline result, std::string> parse_literal_string(location& loc) { const auto first = loc.iter(); if(const auto token = lex_literal_string::invoke(loc)) { location inner_loc(loc.name(), token.unwrap().str()); const auto open = lex_apostrophe::invoke(inner_loc); if(!open) { throw internal_error(format_underline( "parse_literal_string: invalid token", {{source_location(inner_loc), "should be '"}}), source_location(inner_loc)); } const auto body = repeat::invoke(inner_loc); const auto close = lex_apostrophe::invoke(inner_loc); if(!close) { throw internal_error(format_underline( "parse_literal_string: invalid token", {{source_location(inner_loc), "should be '"}}), source_location(inner_loc)); } const auto err_loc = check_utf8_validity(token.unwrap().str()); if(err_loc == -1) { return ok(std::make_pair( toml::string(body.unwrap().str(), toml::string_t::literal), token.unwrap())); } else { inner_loc.reset(first); inner_loc.advance(err_loc); throw syntax_error(format_underline( "parse_ml_basic_string: invalid utf8 sequence found", {{source_location(inner_loc), "here"}}), source_location(inner_loc)); } } else { loc.reset(first); // rollback return err(format_underline("toml::parse_literal_string: " "the next token is not a valid literal string", {{source_location(loc), "here"}})); } } inline result, std::string> parse_string(location& loc) { if(loc.iter() != loc.end() && *(loc.iter()) == '"') { if(loc.iter() + 1 != loc.end() && *(loc.iter() + 1) == '"' && loc.iter() + 2 != loc.end() && *(loc.iter() + 2) == '"') { return parse_ml_basic_string(loc); } else { return parse_basic_string(loc); } } else if(loc.iter() != loc.end() && *(loc.iter()) == '\'') { if(loc.iter() + 1 != loc.end() && *(loc.iter() + 1) == '\'' && loc.iter() + 2 != loc.end() && *(loc.iter() + 2) == '\'') { return parse_ml_literal_string(loc); } else { return parse_literal_string(loc); } } return err(format_underline("toml::parse_string: ", {{source_location(loc), "the next token is not a string"}})); } inline result, std::string> parse_local_date(location& loc) { const auto first = loc.iter(); if(const auto token = lex_local_date::invoke(loc)) { location inner_loc(loc.name(), token.unwrap().str()); const auto y = lex_date_fullyear::invoke(inner_loc); if(!y || inner_loc.iter() == inner_loc.end() || *inner_loc.iter() != '-') { throw internal_error(format_underline( "toml::parse_inner_local_date: invalid year format", {{source_location(inner_loc), "should be `-`"}}), source_location(inner_loc)); } inner_loc.advance(); const auto m = lex_date_month::invoke(inner_loc); if(!m || inner_loc.iter() == inner_loc.end() || *inner_loc.iter() != '-') { throw internal_error(format_underline( "toml::parse_local_date: invalid month format", {{source_location(inner_loc), "should be `-`"}}), source_location(inner_loc)); } inner_loc.advance(); const auto d = lex_date_mday::invoke(inner_loc); if(!d) { throw internal_error(format_underline( "toml::parse_local_date: invalid day format", {{source_location(inner_loc), "here"}}), source_location(inner_loc)); } const auto year = static_cast(from_string(y.unwrap().str(), 0)); const auto month = static_cast(from_string(m.unwrap().str(), 0)); const auto day = static_cast(from_string(d.unwrap().str(), 0)); // We briefly check whether the input date is valid or not. But here, we // only check if the RFC3339 compliance. // Actually there are several special date that does not exist, // because of historical reasons, such as 1582/10/5-1582/10/14 (only in // several countries). But here, we do not care about such a complicated // rule. It makes the code complicated and there is only low probability // that such a specific date is needed in practice. If someone need to // validate date accurately, that means that the one need a specialized // library for their purpose in a different layer. { const bool is_leap = (year % 4 == 0) && ((year % 100 != 0) || (year % 400 == 0)); const auto max_day = (month == 2) ? (is_leap ? 29 : 28) : ((month == 4 || month == 6 || month == 9 || month == 11) ? 30 : 31); if((month < 1 || 12 < month) || (day < 1 || max_day < day)) { throw syntax_error(format_underline("toml::parse_date: " "invalid date: it does not conform RFC3339.", {{ source_location(loc), "month should be 01-12, day should be" " 01-28,29,30,31, depending on month/year." }}), source_location(inner_loc)); } } return ok(std::make_pair(local_date(year, static_cast(month - 1), day), token.unwrap())); } else { loc.reset(first); return err(format_underline("toml::parse_local_date: ", {{source_location(loc), "the next token is not a local_date"}})); } } inline result, std::string> parse_local_time(location& loc) { const auto first = loc.iter(); if(const auto token = lex_local_time::invoke(loc)) { location inner_loc(loc.name(), token.unwrap().str()); const auto h = lex_time_hour::invoke(inner_loc); if(!h || inner_loc.iter() == inner_loc.end() || *inner_loc.iter() != ':') { throw internal_error(format_underline( "toml::parse_local_time: invalid year format", {{source_location(inner_loc), "should be `:`"}}), source_location(inner_loc)); } inner_loc.advance(); const auto m = lex_time_minute::invoke(inner_loc); if(!m || inner_loc.iter() == inner_loc.end() || *inner_loc.iter() != ':') { throw internal_error(format_underline( "toml::parse_local_time: invalid month format", {{source_location(inner_loc), "should be `:`"}}), source_location(inner_loc)); } inner_loc.advance(); const auto s = lex_time_second::invoke(inner_loc); if(!s) { throw internal_error(format_underline( "toml::parse_local_time: invalid second format", {{source_location(inner_loc), "here"}}), source_location(inner_loc)); } const int hour = from_string(h.unwrap().str(), 0); const int minute = from_string(m.unwrap().str(), 0); const int second = from_string(s.unwrap().str(), 0); if((hour < 0 || 23 < hour) || (minute < 0 || 59 < minute) || (second < 0 || 60 < second)) // it may be leap second { throw syntax_error(format_underline("toml::parse_time: " "invalid time: it does not conform RFC3339.", {{ source_location(loc), "hour should be 00-23, minute should be" " 00-59, second should be 00-60 (depending on the leap" " second rules.)"}}), source_location(inner_loc)); } local_time time(hour, minute, second, 0, 0); const auto before_secfrac = inner_loc.iter(); if(const auto secfrac = lex_time_secfrac::invoke(inner_loc)) { auto sf = secfrac.unwrap().str(); sf.erase(sf.begin()); // sf.front() == '.' switch(sf.size() % 3) { case 2: sf += '0'; break; case 1: sf += "00"; break; case 0: break; default: break; } if(sf.size() >= 9) { time.millisecond = from_string(sf.substr(0, 3), 0u); time.microsecond = from_string(sf.substr(3, 3), 0u); time.nanosecond = from_string(sf.substr(6, 3), 0u); } else if(sf.size() >= 6) { time.millisecond = from_string(sf.substr(0, 3), 0u); time.microsecond = from_string(sf.substr(3, 3), 0u); } else if(sf.size() >= 3) { time.millisecond = from_string(sf, 0u); time.microsecond = 0u; } } else { if(before_secfrac != inner_loc.iter()) { throw internal_error(format_underline( "toml::parse_local_time: invalid subsecond format", {{source_location(inner_loc), "here"}}), source_location(inner_loc)); } } return ok(std::make_pair(time, token.unwrap())); } else { loc.reset(first); return err(format_underline("toml::parse_local_time: ", {{source_location(loc), "the next token is not a local_time"}})); } } inline result, std::string> parse_local_datetime(location& loc) { const auto first = loc.iter(); if(const auto token = lex_local_date_time::invoke(loc)) { location inner_loc(loc.name(), token.unwrap().str()); const auto date = parse_local_date(inner_loc); if(!date || inner_loc.iter() == inner_loc.end()) { throw internal_error(format_underline( "toml::parse_local_datetime: invalid datetime format", {{source_location(inner_loc), "date, not datetime"}}), source_location(inner_loc)); } const char delim = *(inner_loc.iter()); if(delim != 'T' && delim != 't' && delim != ' ') { throw internal_error(format_underline( "toml::parse_local_datetime: invalid datetime format", {{source_location(inner_loc), "should be `T` or ` ` (space)"}}), source_location(inner_loc)); } inner_loc.advance(); const auto time = parse_local_time(inner_loc); if(!time) { throw internal_error(format_underline( "toml::parse_local_datetime: invalid datetime format", {{source_location(inner_loc), "invalid time format"}}), source_location(inner_loc)); } return ok(std::make_pair( local_datetime(date.unwrap().first, time.unwrap().first), token.unwrap())); } else { loc.reset(first); return err(format_underline("toml::parse_local_datetime: ", {{source_location(loc), "the next token is not a local_datetime"}})); } } inline result, std::string> parse_offset_datetime(location& loc) { const auto first = loc.iter(); if(const auto token = lex_offset_date_time::invoke(loc)) { location inner_loc(loc.name(), token.unwrap().str()); const auto datetime = parse_local_datetime(inner_loc); if(!datetime || inner_loc.iter() == inner_loc.end()) { throw internal_error(format_underline( "toml::parse_offset_datetime: invalid datetime format", {{source_location(inner_loc), "date, not datetime"}}), source_location(inner_loc)); } time_offset offset(0, 0); if(const auto ofs = lex_time_numoffset::invoke(inner_loc)) { const auto str = ofs.unwrap().str(); const auto hour = from_string(str.substr(1,2), 0); const auto minute = from_string(str.substr(4,2), 0); if((hour < 0 || 23 < hour) || (minute < 0 || 59 < minute)) { throw syntax_error(format_underline("toml::parse_offset_datetime: " "invalid offset: it does not conform RFC3339.", {{ source_location(loc), "month should be 01-12, day should be" " 01-28,29,30,31, depending on month/year." }}), source_location(inner_loc)); } if(str.front() == '+') { offset = time_offset(hour, minute); } else { offset = time_offset(-hour, -minute); } } else if(*inner_loc.iter() != 'Z' && *inner_loc.iter() != 'z') { throw internal_error(format_underline( "toml::parse_offset_datetime: invalid datetime format", {{source_location(inner_loc), "should be `Z` or `+HH:MM`"}}), source_location(inner_loc)); } return ok(std::make_pair(offset_datetime(datetime.unwrap().first, offset), token.unwrap())); } else { loc.reset(first); return err(format_underline("toml::parse_offset_datetime: ", {{source_location(loc), "the next token is not a offset_datetime"}})); } } inline result, std::string> parse_simple_key(location& loc) { if(const auto bstr = parse_basic_string(loc)) { return ok(std::make_pair(bstr.unwrap().first.str, bstr.unwrap().second)); } if(const auto lstr = parse_literal_string(loc)) { return ok(std::make_pair(lstr.unwrap().first.str, lstr.unwrap().second)); } if(const auto bare = lex_unquoted_key::invoke(loc)) { const auto reg = bare.unwrap(); return ok(std::make_pair(reg.str(), reg)); } return err(format_underline("toml::parse_simple_key: ", {{source_location(loc), "the next token is not a simple key"}})); } // dotted key become vector of keys inline result, region>, std::string> parse_key(location& loc) { const auto first = loc.iter(); // dotted key -> `foo.bar.baz` where several single keys are chained by // dots. Whitespaces between keys and dots are allowed. if(const auto token = lex_dotted_key::invoke(loc)) { const auto reg = token.unwrap(); location inner_loc(loc.name(), reg.str()); std::vector keys; while(inner_loc.iter() != inner_loc.end()) { lex_ws::invoke(inner_loc); if(const auto k = parse_simple_key(inner_loc)) { keys.push_back(k.unwrap().first); } else { throw internal_error(format_underline( "toml::detail::parse_key: dotted key contains invalid key", {{source_location(inner_loc), k.unwrap_err()}}), source_location(inner_loc)); } lex_ws::invoke(inner_loc); if(inner_loc.iter() == inner_loc.end()) { break; } else if(*inner_loc.iter() == '.') { inner_loc.advance(); // to skip `.` } else { throw internal_error(format_underline("toml::parse_key: " "dotted key contains invalid key ", {{source_location(inner_loc), "should be `.`"}}), source_location(inner_loc)); } } return ok(std::make_pair(keys, reg)); } loc.reset(first); // simple_key: a single (basic_string|literal_string|bare key) if(const auto smpl = parse_simple_key(loc)) { return ok(std::make_pair(std::vector(1, smpl.unwrap().first), smpl.unwrap().second)); } return err(format_underline("toml::parse_key: an invalid key appeared.", {{source_location(loc), "is not a valid key"}}, { "bare keys : non-empty strings composed only of [A-Za-z0-9_-].", "quoted keys: same as \"basic strings\" or 'literal strings'.", "dotted keys: sequence of bare or quoted keys joined with a dot." })); } // forward-decl to implement parse_array and parse_table template result parse_value(location&); template result, std::string> parse_array(location& loc) { using value_type = Value; using array_type = typename value_type::array_type; const auto first = loc.iter(); if(loc.iter() == loc.end()) { return err("toml::parse_array: input is empty"); } if(*loc.iter() != '[') { return err("toml::parse_array: token is not an array"); } loc.advance(); using lex_ws_comment_newline = repeat< either, unlimited>; array_type retval; while(loc.iter() != loc.end()) { lex_ws_comment_newline::invoke(loc); // skip if(loc.iter() != loc.end() && *loc.iter() == ']') { loc.advance(); // skip ']' return ok(std::make_pair(retval, region(loc, first, loc.iter()))); } if(auto val = parse_value(loc)) { // After TOML v1.0.0-rc.1, array becomes to be able to have values // with different types. So here we will omit this by default. // // But some of the test-suite checks if the parser accepts a hetero- // geneous arrays, so we keep this for a while. #ifdef TOML11_DISALLOW_HETEROGENEOUS_ARRAYS if(!retval.empty() && retval.front().type() != val.as_ok().type()) { auto array_start_loc = loc; array_start_loc.reset(first); throw syntax_error(format_underline("toml::parse_array: " "type of elements should be the same each other.", { {source_location(array_start_loc), "array starts here"}, { retval.front().location(), "value has type " + stringize(retval.front().type()) }, { val.unwrap().location(), "value has different type, " + stringize(val.unwrap().type()) } }), source_location(loc)); } #endif retval.push_back(std::move(val.unwrap())); } else { auto array_start_loc = loc; array_start_loc.reset(first); throw syntax_error(format_underline("toml::parse_array: " "value having invalid format appeared in an array", { {source_location(array_start_loc), "array starts here"}, {source_location(loc), "it is not a valid value."} }), source_location(loc)); } using lex_array_separator = sequence, character<','>>; const auto sp = lex_array_separator::invoke(loc); if(!sp) { lex_ws_comment_newline::invoke(loc); if(loc.iter() != loc.end() && *loc.iter() == ']') { loc.advance(); // skip ']' return ok(std::make_pair(retval, region(loc, first, loc.iter()))); } else { auto array_start_loc = loc; array_start_loc.reset(first); throw syntax_error(format_underline("toml::parse_array:" " missing array separator `,` after a value", { {source_location(array_start_loc), "array starts here"}, {source_location(loc), "should be `,`"} }), source_location(loc)); } } } loc.reset(first); throw syntax_error(format_underline("toml::parse_array: " "array did not closed by `]`", {{source_location(loc), "should be closed"}}), source_location(loc)); } template result, region>, Value>, std::string> parse_key_value_pair(location& loc) { using value_type = Value; const auto first = loc.iter(); auto key_reg = parse_key(loc); if(!key_reg) { std::string msg = std::move(key_reg.unwrap_err()); // if the next token is keyvalue-separator, it means that there are no // key. then we need to show error as "empty key is not allowed". if(const auto keyval_sep = lex_keyval_sep::invoke(loc)) { loc.reset(first); msg = format_underline("toml::parse_key_value_pair: " "empty key is not allowed.", {{source_location(loc), "key expected before '='"}}); } return err(std::move(msg)); } const auto kvsp = lex_keyval_sep::invoke(loc); if(!kvsp) { std::string msg; // if the line contains '=' after the invalid sequence, possibly the // error is in the key (like, invalid character in bare key). const auto line_end = std::find(loc.iter(), loc.end(), '\n'); if(std::find(loc.iter(), line_end, '=') != line_end) { msg = format_underline("toml::parse_key_value_pair: " "invalid format for key", {{source_location(loc), "invalid character in key"}}, {"Did you forget '.' to separate dotted-key?", "Allowed characters for bare key are [0-9a-zA-Z_-]."}); } else // if not, the error is lack of key-value separator. { msg = format_underline("toml::parse_key_value_pair: " "missing key-value separator `=`", {{source_location(loc), "should be `=`"}}); } loc.reset(first); return err(std::move(msg)); } const auto after_kvsp = loc.iter(); // err msg auto val = parse_value(loc); if(!val) { std::string msg; loc.reset(after_kvsp); // check there is something not a comment/whitespace after `=` if(sequence, maybe, lex_newline>::invoke(loc)) { loc.reset(after_kvsp); msg = format_underline("toml::parse_key_value_pair: " "missing value after key-value separator '='", {{source_location(loc), "expected value, but got nothing"}}); } else // there is something not a comment/whitespace, so invalid format. { msg = std::move(val.unwrap_err()); } loc.reset(first); return err(msg); } return ok(std::make_pair(std::move(key_reg.unwrap()), std::move(val.unwrap()))); } // for error messages. template std::string format_dotted_keys(InputIterator first, const InputIterator last) { static_assert(std::is_same::value_type>::value,""); std::string retval(*first++); for(; first != last; ++first) { retval += '.'; retval += *first; } return retval; } // forward decl for is_valid_forward_table_definition result, region>, std::string> parse_table_key(location& loc); template result, std::string> parse_inline_table(location& loc); // The following toml file is allowed. // ```toml // [a.b.c] # here, table `a` has element `b`. // foo = "bar" // [a] # merge a = {baz = "qux"} to a = {b = {...}} // baz = "qux" // ``` // But the following is not allowed. // ```toml // [a] // b.c.foo = "bar" // [a] # error! the same table [a] defined! // baz = "qux" // ``` // The following is neither allowed. // ```toml // a = { b.c.foo = "bar"} // [a] # error! the same table [a] defined! // baz = "qux" // ``` // Here, it parses region of `tab->at(k)` as a table key and check the depth // of the key. If the key region points deeper node, it would be allowed. // Otherwise, the key points the same node. It would be rejected. template bool is_valid_forward_table_definition(const Value& fwd, const Value& inserting, Iterator key_first, Iterator key_curr, Iterator key_last) { // ------------------------------------------------------------------------ // check type of the value to be inserted/merged std::string inserting_reg = ""; if(const auto ptr = detail::get_region(inserting)) { inserting_reg = ptr->str(); } location inserting_def("internal", std::move(inserting_reg)); if(const auto inlinetable = parse_inline_table(inserting_def)) { // check if we are overwriting existing table. // ```toml // # NG // a.b = 42 // a = {d = 3.14} // ``` // Inserting an inline table to a existing super-table is not allowed in // any case. If we found it, we can reject it without further checking. return false; } // ------------------------------------------------------------------------ // check table defined before std::string internal = ""; if(const auto ptr = detail::get_region(fwd)) { internal = ptr->str(); } location def("internal", std::move(internal)); if(const auto tabkeys = parse_table_key(def)) // [table.key] { // table keys always contains all the nodes from the root. const auto& tks = tabkeys.unwrap().first; if(std::size_t(std::distance(key_first, key_last)) == tks.size() && std::equal(tks.begin(), tks.end(), key_first)) { // the keys are equivalent. it is not allowed. return false; } // the keys are not equivalent. it is allowed. return true; } if(const auto dotkeys = parse_key(def)) { // consider the following case. // [a] // b.c = {d = 42} // [a.b.c] // e = 2.71 // this defines the table [a.b.c] twice. no? // a dotted key starts from the node representing a table in which the // dotted key belongs to. const auto& dks = dotkeys.unwrap().first; if(std::size_t(std::distance(key_curr, key_last)) == dks.size() && std::equal(dks.begin(), dks.end(), key_curr)) { // the keys are equivalent. it is not allowed. return false; } // the keys are not equivalent. it is allowed. return true; } return false; } template result insert_nested_key(typename Value::table_type& root, const Value& v, InputIterator iter, const InputIterator last, region key_reg, const bool is_array_of_table = false) { static_assert(std::is_same::value_type>::value,""); using value_type = Value; using table_type = typename value_type::table_type; using array_type = typename value_type::array_type; const auto first = iter; assert(iter != last); table_type* tab = std::addressof(root); for(; iter != last; ++iter) // search recursively { const key& k = *iter; if(std::next(iter) == last) // k is the last key { // XXX if the value is array-of-tables, there can be several // tables that are in the same array. in that case, we need to // find the last element and insert it to there. if(is_array_of_table) { if(tab->count(k) == 1) // there is already an array of table { if(tab->at(k).is_table()) { // show special err msg for conflicting table throw syntax_error(format_underline(concat_to_string( "toml::insert_value: array of table (\"", format_dotted_keys(first, last), "\") cannot be defined"), { {tab->at(k).location(), "table already defined"}, {v.location(), "this conflicts with the previous table"} }), v.location()); } else if(!(tab->at(k).is_array())) { throw syntax_error(format_underline(concat_to_string( "toml::insert_value: array of table (\"", format_dotted_keys(first, last), "\") collides with" " existing value"), { {tab->at(k).location(), concat_to_string("this ", tab->at(k).type(), " value already exists")}, {v.location(), "while inserting this array-of-tables"} }), v.location()); } // the above if-else-if checks tab->at(k) is an array auto& a = tab->at(k).as_array(); // If table element is defined as [[array_of_tables]], it // cannot be an empty array. If an array of tables is // defined as `aot = []`, it cannot be appended. if(a.empty() || !(a.front().is_table())) { throw syntax_error(format_underline(concat_to_string( "toml::insert_value: array of table (\"", format_dotted_keys(first, last), "\") collides with" " existing value"), { {tab->at(k).location(), concat_to_string("this ", tab->at(k).type(), " value already exists")}, {v.location(), "while inserting this array-of-tables"} }), v.location()); } // avoid conflicting array of table like the following. // ```toml // a = [{b = 42}] # define a as an array of *inline* tables // [[a]] # a is an array of *multi-line* tables // b = 54 // ``` // Here, from the type information, these cannot be detected // because inline table is also a table. // But toml v0.5.0 explicitly says it is invalid. The above // array-of-tables has a static size and appending to the // array is invalid. // In this library, multi-line table value has a region // that points to the key of the table (e.g. [[a]]). By // comparing the first two letters in key, we can detect // the array-of-table is inline or multiline. if(const auto ptr = detail::get_region(a.front())) { if(ptr->str().substr(0,2) != "[[") { throw syntax_error(format_underline(concat_to_string( "toml::insert_value: array of table (\"", format_dotted_keys(first, last), "\") collides " "with existing array-of-tables"), { {tab->at(k).location(), concat_to_string("this ", tab->at(k).type(), " value has static size")}, {v.location(), "appending it to the statically sized array"} }), v.location()); } } a.push_back(v); return ok(true); } else // if not, we need to create the array of table { // XXX: Consider the following array of tables. // ```toml // # This is a comment. // [[aot]] // foo = "bar" // ``` // Here, the comment is for `aot`. But here, actually two // values are defined. An array that contains tables, named // `aot`, and the 0th element of the `aot`, `{foo = "bar"}`. // Those two are different from each other. But both of them // points to the same portion of the TOML file, `[[aot]]`, // so `key_reg.comments()` returns `# This is a comment`. // If it is assigned as a comment of `aot` defined here, the // comment will be duplicated. Both the `aot` itself and // the 0-th element will have the same comment. This causes // "duplication of the same comments" bug when the data is // serialized. // Next, consider the following. // ```toml // # comment 1 // aot = [ // # comment 2 // {foo = "bar"}, // ] // ``` // In this case, we can distinguish those two comments. So // here we need to add "comment 1" to the `aot` and // "comment 2" to the 0th element of that. // To distinguish those two, we check the key region. std::vector comments{/* empty by default */}; if(key_reg.str().substr(0, 2) != "[[") { comments = key_reg.comments(); } value_type aot(array_type(1, v), key_reg, std::move(comments)); tab->insert(std::make_pair(k, aot)); return ok(true); } } // end if(array of table) if(tab->count(k) == 1) { if(tab->at(k).is_table() && v.is_table()) { if(!is_valid_forward_table_definition( tab->at(k), v, first, iter, last)) { throw syntax_error(format_underline(concat_to_string( "toml::insert_value: table (\"", format_dotted_keys(first, last), "\") already exists."), { {tab->at(k).location(), "table already exists here"}, {v.location(), "table defined twice"} }), v.location()); } // to allow the following toml file. // [a.b.c] // d = 42 // [a] // e = 2.71 auto& t = tab->at(k).as_table(); for(const auto& kv : v.as_table()) { if(tab->at(k).contains(kv.first)) { throw syntax_error(format_underline(concat_to_string( "toml::insert_value: value (\"", format_dotted_keys(first, last), "\") already exists."), { {t.at(kv.first).location(), "already exists here"}, {v.location(), "this defined twice"} }), v.location()); } t[kv.first] = kv.second; } detail::change_region(tab->at(k), key_reg); return ok(true); } else if(v.is_table() && tab->at(k).is_array() && tab->at(k).as_array().size() > 0 && tab->at(k).as_array().front().is_table()) { throw syntax_error(format_underline(concat_to_string( "toml::insert_value: array of tables (\"", format_dotted_keys(first, last), "\") already exists."), { {tab->at(k).location(), "array of tables defined here"}, {v.location(), "table conflicts with the previous array of table"} }), v.location()); } else { throw syntax_error(format_underline(concat_to_string( "toml::insert_value: value (\"", format_dotted_keys(first, last), "\") already exists."), { {tab->at(k).location(), "value already exists here"}, {v.location(), "value defined twice"} }), v.location()); } } tab->insert(std::make_pair(k, v)); return ok(true); } else // k is not the last one, we should insert recursively { // if there is no corresponding value, insert it first. // related: you don't need to write // # [x] // # [x.y] // to write // [x.y.z] if(tab->count(k) == 0) { // a table that is defined implicitly doesn't have any comments. (*tab)[k] = value_type(table_type{}, key_reg, {/*no comment*/}); } // type checking... if(tab->at(k).is_table()) { // According to toml-lang/toml:36d3091b3 "Clarify that inline // tables are immutable", check if it adds key-value pair to an // inline table. if(const auto* ptr = get_region(tab->at(k))) { // here, if the value is a (multi-line) table, the region // should be something like `[table-name]`. if(ptr->front() == '{') { throw syntax_error(format_underline(concat_to_string( "toml::insert_value: inserting to an inline table (", format_dotted_keys(first, std::next(iter)), ") but inline tables are immutable"), { {tab->at(k).location(), "inline tables are immutable"}, {v.location(), "inserting this"} }), v.location()); } } tab = std::addressof((*tab)[k].as_table()); } else if(tab->at(k).is_array()) // inserting to array-of-tables? { auto& a = (*tab)[k].as_array(); if(!a.back().is_table()) { throw syntax_error(format_underline(concat_to_string( "toml::insert_value: target (", format_dotted_keys(first, std::next(iter)), ") is neither table nor an array of tables"), { {a.back().location(), concat_to_string( "actual type is ", a.back().type())}, {v.location(), "inserting this"} }), v.location()); } tab = std::addressof(a.back().as_table()); } else { throw syntax_error(format_underline(concat_to_string( "toml::insert_value: target (", format_dotted_keys(first, std::next(iter)), ") is neither table nor an array of tables"), { {tab->at(k).location(), concat_to_string( "actual type is ", tab->at(k).type())}, {v.location(), "inserting this"} }), v.location()); } } } return err(std::string("toml::detail::insert_nested_key: never reach here")); } template result, std::string> parse_inline_table(location& loc) { using value_type = Value; using table_type = typename value_type::table_type; const auto first = loc.iter(); table_type retval; if(!(loc.iter() != loc.end() && *loc.iter() == '{')) { return err(format_underline("toml::parse_inline_table: ", {{source_location(loc), "the next token is not an inline table"}})); } loc.advance(); // check if the inline table is an empty table = { } maybe::invoke(loc); if(loc.iter() != loc.end() && *loc.iter() == '}') { loc.advance(); // skip `}` return ok(std::make_pair(retval, region(loc, first, loc.iter()))); } // it starts from "{". it should be formatted as inline-table while(loc.iter() != loc.end()) { const auto kv_r = parse_key_value_pair(loc); if(!kv_r) { return err(kv_r.unwrap_err()); } const auto& kvpair = kv_r.unwrap(); const std::vector& keys = kvpair.first.first; const auto& key_reg = kvpair.first.second; const value_type& val = kvpair.second; const auto inserted = insert_nested_key(retval, val, keys.begin(), keys.end(), key_reg); if(!inserted) { throw internal_error("toml::parse_inline_table: " "failed to insert value into table: " + inserted.unwrap_err(), source_location(loc)); } using lex_table_separator = sequence, character<','>>; const auto sp = lex_table_separator::invoke(loc); if(!sp) { maybe::invoke(loc); if(loc.iter() == loc.end()) { throw syntax_error(format_underline( "toml::parse_inline_table: missing table separator `}` ", {{source_location(loc), "should be `}`"}}), source_location(loc)); } else if(*loc.iter() == '}') { loc.advance(); // skip `}` return ok(std::make_pair( retval, region(loc, first, loc.iter()))); } else if(*loc.iter() == '#' || *loc.iter() == '\r' || *loc.iter() == '\n') { throw syntax_error(format_underline( "toml::parse_inline_table: missing curly brace `}`", {{source_location(loc), "should be `}`"}}), source_location(loc)); } else { throw syntax_error(format_underline( "toml::parse_inline_table: missing table separator `,` ", {{source_location(loc), "should be `,`"}}), source_location(loc)); } } else // `,` is found { maybe::invoke(loc); if(loc.iter() != loc.end() && *loc.iter() == '}') { throw syntax_error(format_underline( "toml::parse_inline_table: trailing comma is not allowed in" " an inline table", {{source_location(loc), "should be `}`"}}), source_location(loc)); } } } loc.reset(first); throw syntax_error(format_underline("toml::parse_inline_table: " "inline table did not closed by `}`", {{source_location(loc), "should be closed"}}), source_location(loc)); } inline result guess_number_type(const location& l) { // This function tries to find some (common) mistakes by checking characters // that follows the last character of a value. But it is often difficult // because some non-newline characters can appear after a value. E.g. // spaces, tabs, commas (in an array or inline table), closing brackets // (of an array or inline table), comment-sign (#). Since this function // does not parse further, those characters are always allowed to be there. location loc = l; if(lex_offset_date_time::invoke(loc)) {return ok(value_t::offset_datetime);} loc.reset(l.iter()); if(lex_local_date_time::invoke(loc)) { // bad offset may appear after this. if(loc.iter() != loc.end() && (*loc.iter() == '+' || *loc.iter() == '-' || *loc.iter() == 'Z' || *loc.iter() == 'z')) { return err(format_underline("bad offset: should be [+-]HH:MM or Z", {{source_location(loc), "[+-]HH:MM or Z"}}, {"pass: +09:00, -05:30", "fail: +9:00, -5:30"})); } return ok(value_t::local_datetime); } loc.reset(l.iter()); if(lex_local_date::invoke(loc)) { // bad time may appear after this. // A space is allowed as a delimiter between local time. But there are // both cases in which a space becomes valid or invalid. // - invalid: 2019-06-16 7:00:00 // - valid : 2019-06-16 07:00:00 if(loc.iter() != loc.end()) { const auto c = *loc.iter(); if(c == 'T' || c == 't') { return err(format_underline("bad time: should be HH:MM:SS.subsec", {{source_location(loc), "HH:MM:SS.subsec"}}, {"pass: 1979-05-27T07:32:00, 1979-05-27 07:32:00.999999", "fail: 1979-05-27T7:32:00, 1979-05-27 17:32"})); } if('0' <= c && c <= '9') { return err(format_underline("bad time: missing T", {{source_location(loc), "T or space required here"}}, {"pass: 1979-05-27T07:32:00, 1979-05-27 07:32:00.999999", "fail: 1979-05-27T7:32:00, 1979-05-27 7:32"})); } if(c == ' ' && std::next(loc.iter()) != loc.end() && ('0' <= *std::next(loc.iter()) && *std::next(loc.iter())<= '9')) { loc.advance(); return err(format_underline("bad time: should be HH:MM:SS.subsec", {{source_location(loc), "HH:MM:SS.subsec"}}, {"pass: 1979-05-27T07:32:00, 1979-05-27 07:32:00.999999", "fail: 1979-05-27T7:32:00, 1979-05-27 7:32"})); } } return ok(value_t::local_date); } loc.reset(l.iter()); if(lex_local_time::invoke(loc)) {return ok(value_t::local_time);} loc.reset(l.iter()); if(lex_float::invoke(loc)) { if(loc.iter() != loc.end() && *loc.iter() == '_') { return err(format_underline("bad float: `_` should be surrounded by digits", {{source_location(loc), "here"}}, {"pass: +1.0, -2e-2, 3.141_592_653_589, inf, nan", "fail: .0, 1., _1.0, 1.0_, 1_.0, 1.0__0"})); } return ok(value_t::floating); } loc.reset(l.iter()); if(lex_integer::invoke(loc)) { if(loc.iter() != loc.end()) { const auto c = *loc.iter(); if(c == '_') { return err(format_underline("bad integer: `_` should be surrounded by digits", {{source_location(loc), "here"}}, {"pass: -42, 1_000, 1_2_3_4_5, 0xC0FFEE, 0b0010, 0o755", "fail: 1__000, 0123"})); } if('0' <= c && c <= '9') { // leading zero. point '0' loc.retrace(); return err(format_underline("bad integer: leading zero", {{source_location(loc), "here"}}, {"pass: -42, 1_000, 1_2_3_4_5, 0xC0FFEE, 0b0010, 0o755", "fail: 1__000, 0123"})); } if(c == ':' || c == '-') { return err(format_underline("bad datetime: invalid format", {{source_location(loc), "here"}}, {"pass: 1979-05-27T07:32:00-07:00, 1979-05-27 07:32:00.999999Z", "fail: 1979-05-27T7:32:00-7:00, 1979-05-27 7:32-00:30"})); } if(c == '.' || c == 'e' || c == 'E') { return err(format_underline("bad float: invalid format", {{source_location(loc), "here"}}, {"pass: +1.0, -2e-2, 3.141_592_653_589, inf, nan", "fail: .0, 1., _1.0, 1.0_, 1_.0, 1.0__0"})); } } return ok(value_t::integer); } if(loc.iter() != loc.end() && *loc.iter() == '.') { return err(format_underline("bad float: invalid format", {{source_location(loc), "integer part required before this"}}, {"pass: +1.0, -2e-2, 3.141_592_653_589, inf, nan", "fail: .0, 1., _1.0, 1.0_, 1_.0, 1.0__0"})); } if(loc.iter() != loc.end() && *loc.iter() == '_') { return err(format_underline("bad number: `_` should be surrounded by digits", {{source_location(loc), "`_` is not surrounded by digits"}}, {"pass: -42, 1_000, 1_2_3_4_5, 0xC0FFEE, 0b0010, 0o755", "fail: 1__000, 0123"})); } return err(format_underline("bad format: unknown value appeared", {{source_location(loc), "here"}})); } inline result guess_value_type(const location& loc) { switch(*loc.iter()) { case '"' : {return ok(value_t::string); } case '\'': {return ok(value_t::string); } case 't' : {return ok(value_t::boolean); } case 'f' : {return ok(value_t::boolean); } case '[' : {return ok(value_t::array); } case '{' : {return ok(value_t::table); } case 'i' : {return ok(value_t::floating);} // inf. case 'n' : {return ok(value_t::floating);} // nan. default : {return guess_number_type(loc);} } } template result parse_value_helper(result, std::string> rslt) { if(rslt.is_ok()) { auto comments = rslt.as_ok().second.comments(); return ok(Value(std::move(rslt.as_ok()), std::move(comments))); } else { return err(std::move(rslt.as_err())); } } template result parse_value(location& loc) { const auto first = loc.iter(); if(first == loc.end()) { return err(format_underline("toml::parse_value: input is empty", {{source_location(loc), ""}})); } const auto type = guess_value_type(loc); if(!type) { return err(type.unwrap_err()); } switch(type.unwrap()) { case value_t::boolean : {return parse_value_helper(parse_boolean(loc) );} case value_t::integer : {return parse_value_helper(parse_integer(loc) );} case value_t::floating : {return parse_value_helper(parse_floating(loc) );} case value_t::string : {return parse_value_helper(parse_string(loc) );} case value_t::offset_datetime: {return parse_value_helper(parse_offset_datetime(loc) );} case value_t::local_datetime : {return parse_value_helper(parse_local_datetime(loc) );} case value_t::local_date : {return parse_value_helper(parse_local_date(loc) );} case value_t::local_time : {return parse_value_helper(parse_local_time(loc) );} case value_t::array : {return parse_value_helper(parse_array(loc) );} case value_t::table : {return parse_value_helper(parse_inline_table(loc));} default: { const auto msg = format_underline("toml::parse_value: " "unknown token appeared", {{source_location(loc), "unknown"}}); loc.reset(first); return err(msg); } } } inline result, region>, std::string> parse_table_key(location& loc) { if(auto token = lex_std_table::invoke(loc)) { location inner_loc(loc.name(), token.unwrap().str()); const auto open = lex_std_table_open::invoke(inner_loc); if(!open || inner_loc.iter() == inner_loc.end()) { throw internal_error(format_underline( "toml::parse_table_key: no `[`", {{source_location(inner_loc), "should be `[`"}}), source_location(inner_loc)); } // to skip [ a . b . c ] // ^----------- this whitespace lex_ws::invoke(inner_loc); const auto keys = parse_key(inner_loc); if(!keys) { throw internal_error(format_underline( "toml::parse_table_key: invalid key", {{source_location(inner_loc), "not key"}}), source_location(inner_loc)); } // to skip [ a . b . c ] // ^-- this whitespace lex_ws::invoke(inner_loc); const auto close = lex_std_table_close::invoke(inner_loc); if(!close) { throw internal_error(format_underline( "toml::parse_table_key: no `]`", {{source_location(inner_loc), "should be `]`"}}), source_location(inner_loc)); } // after [table.key], newline or EOF(empty table) required. if(loc.iter() != loc.end()) { using lex_newline_after_table_key = sequence, maybe, lex_newline>; const auto nl = lex_newline_after_table_key::invoke(loc); if(!nl) { throw syntax_error(format_underline( "toml::parse_table_key: newline required after [table.key]", {{source_location(loc), "expected newline"}}), source_location(loc)); } } return ok(std::make_pair(keys.unwrap().first, token.unwrap())); } else { return err(format_underline("toml::parse_table_key: " "not a valid table key", {{source_location(loc), "here"}})); } } inline result, region>, std::string> parse_array_table_key(location& loc) { if(auto token = lex_array_table::invoke(loc)) { location inner_loc(loc.name(), token.unwrap().str()); const auto open = lex_array_table_open::invoke(inner_loc); if(!open || inner_loc.iter() == inner_loc.end()) { throw internal_error(format_underline( "toml::parse_array_table_key: no `[[`", {{source_location(inner_loc), "should be `[[`"}}), source_location(inner_loc)); } lex_ws::invoke(inner_loc); const auto keys = parse_key(inner_loc); if(!keys) { throw internal_error(format_underline( "toml::parse_array_table_key: invalid key", {{source_location(inner_loc), "not a key"}}), source_location(inner_loc)); } lex_ws::invoke(inner_loc); const auto close = lex_array_table_close::invoke(inner_loc); if(!close) { throw internal_error(format_underline( "toml::parse_table_key: no `]]`", {{source_location(inner_loc), "should be `]]`"}}), source_location(inner_loc)); } // after [[table.key]], newline or EOF(empty table) required. if(loc.iter() != loc.end()) { using lex_newline_after_table_key = sequence, maybe, lex_newline>; const auto nl = lex_newline_after_table_key::invoke(loc); if(!nl) { throw syntax_error(format_underline("toml::" "parse_array_table_key: newline required after [[table.key]]", {{source_location(loc), "expected newline"}}), source_location(loc)); } } return ok(std::make_pair(keys.unwrap().first, token.unwrap())); } else { return err(format_underline("toml::parse_array_table_key: " "not a valid table key", {{source_location(loc), "here"}})); } } // parse table body (key-value pairs until the iter hits the next [tablekey]) template result parse_ml_table(location& loc) { using value_type = Value; using table_type = typename value_type::table_type; const auto first = loc.iter(); if(first == loc.end()) { return ok(table_type{}); } // XXX at lest one newline is needed. using skip_line = repeat< sequence, maybe, lex_newline>, at_least<1>>; skip_line::invoke(loc); lex_ws::invoke(loc); table_type tab; while(loc.iter() != loc.end()) { lex_ws::invoke(loc); const auto before = loc.iter(); if(const auto tmp = parse_array_table_key(loc)) // next table found { loc.reset(before); return ok(tab); } if(const auto tmp = parse_table_key(loc)) // next table found { loc.reset(before); return ok(tab); } if(const auto kv = parse_key_value_pair(loc)) { const auto& kvpair = kv.unwrap(); const std::vector& keys = kvpair.first.first; const auto& key_reg = kvpair.first.second; const value_type& val = kvpair.second; const auto inserted = insert_nested_key(tab, val, keys.begin(), keys.end(), key_reg); if(!inserted) { return err(inserted.unwrap_err()); } } else { return err(kv.unwrap_err()); } // comment lines are skipped by the above function call. // However, since the `skip_line` requires at least 1 newline, it fails // if the file ends with ws and/or comment without newline. // `skip_line` matches `ws? + comment? + newline`, not `ws` or `comment` // itself. To skip the last ws and/or comment, call lexers. // It does not matter if these fails, so the return value is discarded. lex_ws::invoke(loc); lex_comment::invoke(loc); // skip_line is (whitespace? comment? newline)_{1,}. multiple empty lines // and comments after the last key-value pairs are allowed. const auto newline = skip_line::invoke(loc); if(!newline && loc.iter() != loc.end()) { const auto before2 = loc.iter(); lex_ws::invoke(loc); // skip whitespace const auto msg = format_underline("toml::parse_table: " "invalid line format", {{source_location(loc), concat_to_string( "expected newline, but got '", show_char(*loc.iter()), "'.")}}); loc.reset(before2); return err(msg); } // the skip_lines only matches with lines that includes newline. // to skip the last line that includes comment and/or whitespace // but no newline, call them one more time. lex_ws::invoke(loc); lex_comment::invoke(loc); } return ok(tab); } template result parse_toml_file(location& loc) { using value_type = Value; using table_type = typename value_type::table_type; const auto first = loc.iter(); if(first == loc.end()) { // For empty files, return an empty table with an empty region (zero-length). // Without the region, error messages would miss the filename. return ok(value_type(table_type{}, region(loc, first, first), {})); } // put the first line as a region of a file // Here first != loc.end(), so taking std::next is okay const region file(loc, first, std::next(loc.iter())); // The first successive comments that are separated from the first value // by an empty line are for a file itself. // ```toml // # this is a comment for a file. // // key = "the first value" // ``` // ```toml // # this is a comment for "the first value". // key = "the first value" // ``` std::vector comments; using lex_first_comments = sequence< repeat, lex_comment, lex_newline>, at_least<1>>, sequence, lex_newline> >; if(const auto token = lex_first_comments::invoke(loc)) { location inner_loc(loc.name(), token.unwrap().str()); while(inner_loc.iter() != inner_loc.end()) { maybe::invoke(inner_loc); // remove ws if exists if(lex_newline::invoke(inner_loc)) { assert(inner_loc.iter() == inner_loc.end()); break; // empty line found. } auto com = lex_comment::invoke(inner_loc).unwrap().str(); com.erase(com.begin()); // remove # sign comments.push_back(std::move(com)); lex_newline::invoke(inner_loc); } } table_type data; // root object is also a table, but without [tablename] if(const auto tab = parse_ml_table(loc)) { data = std::move(tab.unwrap()); } else // failed (empty table is regarded as success in parse_ml_table) { return err(tab.unwrap_err()); } while(loc.iter() != loc.end()) { // here, the region of [table] is regarded as the table-key because // the table body is normally too big and it is not so informative // if the first key-value pair of the table is shown in the error // message. if(const auto tabkey = parse_array_table_key(loc)) { const auto tab = parse_ml_table(loc); if(!tab){return err(tab.unwrap_err());} const auto& tk = tabkey.unwrap(); const auto& keys = tk.first; const auto& reg = tk.second; const auto inserted = insert_nested_key(data, value_type(tab.unwrap(), reg, reg.comments()), keys.begin(), keys.end(), reg, /*is_array_of_table=*/ true); if(!inserted) {return err(inserted.unwrap_err());} continue; } if(const auto tabkey = parse_table_key(loc)) { const auto tab = parse_ml_table(loc); if(!tab){return err(tab.unwrap_err());} const auto& tk = tabkey.unwrap(); const auto& keys = tk.first; const auto& reg = tk.second; const auto inserted = insert_nested_key(data, value_type(tab.unwrap(), reg, reg.comments()), keys.begin(), keys.end(), reg); if(!inserted) {return err(inserted.unwrap_err());} continue; } return err(format_underline("toml::parse_toml_file: " "unknown line appeared", {{source_location(loc), "unknown format"}})); } return ok(Value(std::move(data), file, comments)); } } // detail template class Table = std::unordered_map, template class Array = std::vector> basic_value parse(std::istream& is, const std::string& fname = "unknown file") { using value_type = basic_value; const auto beg = is.tellg(); is.seekg(0, std::ios::end); const auto end = is.tellg(); const auto fsize = end - beg; is.seekg(beg); // read whole file as a sequence of char assert(fsize >= 0); std::vector letters(static_cast(fsize)); is.read(letters.data(), fsize); // append LF. // Although TOML does not require LF at the EOF, to make parsing logic // simpler, we "normalize" the content by adding LF if it does not exist. // It also checks if the last char is CR, to avoid changing the meaning. // This is not the *best* way to deal with the last character, but is a // simple and quick fix. if(!letters.empty() && letters.back() != '\n' && letters.back() != '\r') { letters.push_back('\n'); } detail::location loc(std::move(fname), std::move(letters)); // skip BOM if exists. // XXX component of BOM (like 0xEF) exceeds the representable range of // signed char, so on some (actually, most) of the environment, these cannot // be compared to char. However, since we are always out of luck, we need to // check our chars are equivalent to BOM. To do this, first we need to // convert char to unsigned char to guarantee the comparability. if(loc.source()->size() >= 3) { std::array BOM; std::memcpy(BOM.data(), loc.source()->data(), 3); if(BOM[0] == 0xEF && BOM[1] == 0xBB && BOM[2] == 0xBF) { loc.advance(3); // BOM found. skip. } } const auto data = detail::parse_toml_file(loc); if(!data) { throw syntax_error(data.unwrap_err(), source_location(loc)); } return data.unwrap(); } template class Table = std::unordered_map, template class Array = std::vector> basic_value parse(const std::string& fname) { std::ifstream ifs(fname.c_str(), std::ios_base::binary); if(!ifs.good()) { throw std::runtime_error("toml::parse: file open error -> " + fname); } return parse(ifs, fname); } #ifdef TOML11_HAS_STD_FILESYSTEM // This function just forwards `parse("filename.toml")` to std::string version // to avoid the ambiguity in overload resolution. // // Both std::string and std::filesystem::path are convertible from const char*. // Without this, both parse(std::string) and parse(std::filesystem::path) // matches to parse("filename.toml"). This breaks the existing code. // // This function exactly matches to the invocation with c-string. // So this function is preferred than others and the ambiguity disappears. template class Table = std::unordered_map, template class Array = std::vector> basic_value parse(const char* fname) { return parse(std::string(fname)); } template class Table = std::unordered_map, template class Array = std::vector> basic_value parse(const std::filesystem::path& fpath) { std::ifstream ifs(fpath, std::ios_base::binary); if(!ifs.good()) { throw std::runtime_error("toml::parse: file open error -> " + fpath.string()); } return parse(ifs, fpath.string()); } #endif // TOML11_HAS_STD_FILESYSTEM } // toml #endif// TOML11_PARSER_HPP