// Copyright Toru Niina 2017. // Distributed under the MIT License. #ifndef TOML11_LEXER_HPP #define TOML11_LEXER_HPP #include #include #include #include #include "combinator.hpp" namespace toml { namespace detail { // these scans contents from current location in a container of char // and extract a region that matches their own pattern. // to see the implementation of each component, see combinator.hpp. using lex_wschar = either, character<'\t'>>; using lex_ws = repeat>; using lex_newline = either, sequence, character<'\n'>>>; using lex_lower = in_range<'a', 'z'>; using lex_upper = in_range<'A', 'Z'>; using lex_alpha = either; using lex_digit = in_range<'0', '9'>; using lex_nonzero = in_range<'1', '9'>; using lex_oct_dig = in_range<'0', '7'>; using lex_bin_dig = in_range<'0', '1'>; using lex_hex_dig = either, in_range<'a', 'f'>>; using lex_hex_prefix = sequence, character<'x'>>; using lex_oct_prefix = sequence, character<'o'>>; using lex_bin_prefix = sequence, character<'b'>>; using lex_underscore = character<'_'>; using lex_plus = character<'+'>; using lex_minus = character<'-'>; using lex_sign = either; // digit | nonzero 1*(digit | _ digit) using lex_unsigned_dec_int = either>, at_least<1>>>, lex_digit>; // (+|-)? unsigned_dec_int using lex_dec_int = sequence, lex_unsigned_dec_int>; // hex_prefix hex_dig *(hex_dig | _ hex_dig) using lex_hex_int = sequence>, unlimited>>>; // oct_prefix oct_dig *(oct_dig | _ oct_dig) using lex_oct_int = sequence>, unlimited>>>; // bin_prefix bin_dig *(bin_dig | _ bin_dig) using lex_bin_int = sequence>, unlimited>>>; // (dec_int | hex_int | oct_int | bin_int) using lex_integer = either; // =========================================================================== using lex_inf = sequence, character<'n'>, character<'f'>>; using lex_nan = sequence, character<'a'>, character<'n'>>; using lex_special_float = sequence, either>; using lex_zero_prefixable_int = sequence>, unlimited>>; using lex_fractional_part = sequence, lex_zero_prefixable_int>; using lex_exponent_part = sequence, character<'E'>>, maybe, lex_zero_prefixable_int>; using lex_float = either>>>>; // =========================================================================== using lex_true = sequence, character<'r'>, character<'u'>, character<'e'>>; using lex_false = sequence, character<'a'>, character<'l'>, character<'s'>, character<'e'>>; using lex_boolean = either; // =========================================================================== using lex_date_fullyear = repeat>; using lex_date_month = repeat>; using lex_date_mday = repeat>; using lex_time_delim = either, character<'t'>, character<' '>>; using lex_time_hour = repeat>; using lex_time_minute = repeat>; using lex_time_second = repeat>; using lex_time_secfrac = sequence, repeat>>; using lex_time_numoffset = sequence, character<'-'>>, sequence, lex_time_minute>>; using lex_time_offset = either, character<'z'>, lex_time_numoffset>; using lex_partial_time = sequence, lex_time_minute, character<':'>, lex_time_second, maybe>; using lex_full_date = sequence, lex_date_month, character<'-'>, lex_date_mday>; using lex_full_time = sequence; using lex_offset_date_time = sequence; using lex_local_date_time = sequence; using lex_local_date = lex_full_date; using lex_local_time = lex_partial_time; // =========================================================================== using lex_quotation_mark = character<'"'>; using lex_basic_unescaped = exclude, // 0x09 (tab) is allowed in_range<0x0A, 0x1F>, character<0x22>, character<0x5C>, character<0x7F>>>; using lex_escape = character<'\\'>; using lex_escape_unicode_short = sequence, repeat>>; using lex_escape_unicode_long = sequence, repeat>>; using lex_escape_seq_char = either, character<'\\'>, character<'b'>, character<'f'>, character<'n'>, character<'r'>, character<'t'>, lex_escape_unicode_short, lex_escape_unicode_long >; using lex_escaped = sequence; using lex_basic_char = either; using lex_basic_string = sequence, lex_quotation_mark>; // After toml post-v0.5.0, it is explicitly clarified how quotes in ml-strings // are allowed to be used. // After this, the following strings are *explicitly* allowed. // - One or two `"`s in a multi-line basic string is allowed wherever it is. // - Three consecutive `"`s in a multi-line basic string is considered as a delimiter. // - One or two `"`s can appear just before or after the delimiter. // ```toml // str4 = """Here are two quotation marks: "". Simple enough.""" // str5 = """Here are three quotation marks: ""\".""" // str6 = """Here are fifteen quotation marks: ""\"""\"""\"""\"""\".""" // str7 = """"This," she said, "is just a pointless statement."""" // ``` // In the current implementation (v3.3.0), it is difficult to parse `str7` in // the above example. It is difficult to recognize `"` at the end of string body // collectly. It will be misunderstood as a `"""` delimiter and an additional, // invalid `"`. Like this: // ```console // what(): [error] toml::parse_table: invalid line format // --> hoge.toml // | // 13 | str7 = """"This," she said, "is just a pointless statement."""" // | ^- expected newline, but got '"'. // ``` // As a quick workaround for this problem, `lex_ml_basic_string_delim` was // split into two, `lex_ml_basic_string_open` and `lex_ml_basic_string_close`. // `lex_ml_basic_string_open` allows only `"""`. `_close` allows 3-5 `"`s. // In parse_ml_basic_string() function, the trailing `"`s will be attached to // the string body. // using lex_ml_basic_string_delim = repeat>; using lex_ml_basic_string_open = lex_ml_basic_string_delim; using lex_ml_basic_string_close = sequence< repeat>, maybe, maybe >; using lex_ml_basic_unescaped = exclude, // 0x09 is tab in_range<0x0A, 0x1F>, character<0x5C>, // backslash character<0x7F>, // DEL lex_ml_basic_string_delim>>; using lex_ml_basic_escaped_newline = sequence< lex_escape, maybe, lex_newline, repeat, unlimited>>; using lex_ml_basic_char = either; using lex_ml_basic_body = repeat, unlimited>; using lex_ml_basic_string = sequence; using lex_literal_char = exclude, in_range<0x0A, 0x1F>, character<0x7F>, character<0x27>>>; using lex_apostrophe = character<'\''>; using lex_literal_string = sequence, lex_apostrophe>; // the same reason as above. using lex_ml_literal_string_delim = repeat>; using lex_ml_literal_string_open = lex_ml_literal_string_delim; using lex_ml_literal_string_close = sequence< repeat>, maybe, maybe >; using lex_ml_literal_char = exclude, in_range<0x0A, 0x1F>, character<0x7F>, lex_ml_literal_string_delim>>; using lex_ml_literal_body = repeat, unlimited>; using lex_ml_literal_string = sequence; using lex_string = either; // =========================================================================== using lex_dot_sep = sequence, character<'.'>, maybe>; using lex_unquoted_key = repeat, character<'_'>>, at_least<1>>; using lex_quoted_key = either; using lex_simple_key = either; using lex_dotted_key = sequence, at_least<1> > >; using lex_key = either; using lex_keyval_sep = sequence, character<'='>, maybe>; using lex_std_table_open = character<'['>; using lex_std_table_close = character<']'>; using lex_std_table = sequence, lex_key, maybe, lex_std_table_close>; using lex_array_table_open = sequence; using lex_array_table_close = sequence; using lex_array_table = sequence, lex_key, maybe, lex_array_table_close>; using lex_utf8_1byte = in_range<0x00, 0x7F>; using lex_utf8_2byte = sequence< in_range(0xC2), static_cast(0xDF)>, in_range(0x80), static_cast(0xBF)> >; using lex_utf8_3byte = sequence(0xE0)>, in_range(0xA0), static_cast(0xBF)>>, sequence(0xE1), static_cast(0xEC)>, in_range(0x80), static_cast(0xBF)>>, sequence(0xED)>, in_range(0x80), static_cast(0x9F)>>, sequence(0xEE), static_cast(0xEF)>, in_range(0x80), static_cast(0xBF)>> >, in_range(0x80), static_cast(0xBF)>>; using lex_utf8_4byte = sequence(0xF0)>, in_range(0x90), static_cast(0xBF)>>, sequence(0xF1), static_cast(0xF3)>, in_range(0x80), static_cast(0xBF)>>, sequence(0xF4)>, in_range(0x80), static_cast(0x8F)>> >, in_range(0x80), static_cast(0xBF)>, in_range(0x80), static_cast(0xBF)>>; using lex_utf8_code = either< lex_utf8_1byte, lex_utf8_2byte, lex_utf8_3byte, lex_utf8_4byte >; using lex_comment_start_symbol = character<'#'>; using lex_non_eol_ascii = either, in_range<0x20, 0x7E>>; using lex_comment = sequence, unlimited>>; } // detail } // toml #endif // TOML_LEXER_HPP