From 0ee0ebfb426ad542eaaebf5e9b3fb1a8166e0681 Mon Sep 17 00:00:00 2001 From: Daniel Kolesa Date: Sat, 10 Apr 2021 19:32:01 +0200 Subject: [PATCH] clean up the main parser path --- src/cs_parser.cc | 480 +++++++++++++++++++++++++++-------------------- src/cs_parser.hh | 5 +- 2 files changed, 283 insertions(+), 202 deletions(-) diff --git a/src/cs_parser.cc b/src/cs_parser.cc index 576411c..c4efe2d 100644 --- a/src/cs_parser.cc +++ b/src/cs_parser.cc @@ -307,6 +307,9 @@ bool is_valid_name(std::string_view s) { } } +/* parse out a quoted string; return the raw string, without the quotes + * current parser state will be after the final quote + */ std::string_view parser_state::get_str() { size_t nl; char const *beg = source; @@ -318,12 +321,18 @@ std::string_view parser_state::get_str() { return ret.substr(1, ret.size() - 2); } +/* like the above, but unescapes the string and dups it as a buffer */ charbuf parser_state::get_str_dup() { charbuf buf{ts}; unescape_string(std::back_inserter(buf), get_str()); return buf; } +/* a simple name, used for @foo in macro substitutions + * + * consists of an alpha character (or '_') followed + * by alphanumeric characters (or more '_') + */ std::string_view parser_state::read_macro_name() { char const *op = source; char c = current(); @@ -336,6 +345,7 @@ std::string_view parser_state::read_macro_name() { return std::string_view{op, std::size_t(source - op)}; } +/* advance the parser until we reach any of the given chars, then stop at it */ char parser_state::skip_until(std::string_view chars) { char c = current(); while (c && (chars.find(c) == std::string_view::npos)) { @@ -345,6 +355,7 @@ char parser_state::skip_until(std::string_view chars) { return c; } +/* advance the parser until we reach the given character, then stop at it */ char parser_state::skip_until(char cf) { char c = current(); while (c && (c != cf)) { @@ -382,8 +393,13 @@ void parser_state::skip_comments() { if ((current() != '/') || (current(1) != '/')) { return; } - while (current() != '\n') { - next_char(); + for (;;) { + auto c = current(); + if (c && (c != '\n')) { + next_char(); + } else { + break; + } } } } @@ -1234,220 +1250,282 @@ bool parser_state::parse_id_and_or(ident &id, int ltype) { return more; } -void parser_state::parse_block(int rettype, int brak) { - charbuf idname{gs.ts}; - for (;;) { - skip_comments(); - idname.clear(); - size_t curline = current_line; - bool more = parse_arg(VAL_WORD, &idname); - if (!more) { - goto endstatement; +static bool finish_statement(parser_state &ps, bool more, int term) { + /* skip through any remaining args in the statement */ + if (more) { + while (ps.parse_arg(VAL_POP)) {} + } + /* handle special characters */ + switch (ps.skip_until(")];/\n")) { + /* EOS */ + case '\0': + if (ps.current() != term) { + throw error{*ps.ts.pstate, "missing \"%c\"", char(term)}; + } + return false; + /* terminating parens/brackets */ + case ')': + case ']': + /* if the expected terminator, finish normally */ + if (ps.current() == term) { + ps.next_char(); + return false; + } + throw error{*ps.ts.pstate, "unexpected \"%c\"", ps.current()}; + /* potential comment */ + case '/': + ps.next_char(); + if (ps.current() == '/') { + ps.skip_until('\n'); + } + return finish_statement(ps, false, term); + /* next statement */ + default: + ps.next_char(); + break; + } + /* advance to next statement */ + return true; +} + +bool parser_state::parse_call_id(ident &id, int ltype) { + switch (ident_p{id}.impl().p_type) { + case ID_ALIAS: + return parse_call_alias(static_cast(id)); + case ID_COMMAND: + return parse_call_command( + static_cast(&id), id, ltype + ); + case ID_LOCAL: + return parse_id_local(); + case ID_DO: + return parse_id_do(false, ltype); + case ID_DOARGS: + return parse_id_do(true, ltype); + case ID_IF: + return parse_id_if(id, ltype); + case ID_BREAK: + gs.gen_break(); + return true; + case ID_CONTINUE: + gs.gen_continue(); + return true; + case ID_RESULT: { + bool more = parse_arg(VAL_ANY); + if (!more) { + gs.gen_result_null(ltype); + } else { + gs.gen_result(ltype); + } + return more; } - skip_comments(); - if (current() == '=') { - switch (current(1)) { - case '/': - if (current(2) != '/') { - break; + case ID_NOT: { + bool more = parse_arg(VAL_ANY); + if (!more) { + gs.gen_result_true(ltype); + } else { + gs.gen_not(ltype); + } + return more; + } + case ID_AND: + case ID_OR: + return parse_id_and_or(id, ltype); + case ID_IVAR: { + auto *hid = ts.istate->cmd_ivar; + return parse_call_command( + static_cast(hid), id, ltype + ); + } + case ID_FVAR: { + auto *hid = ts.istate->cmd_fvar; + return parse_call_command( + static_cast(hid), id, ltype + ); + } + case ID_SVAR: { + auto *hid = ts.istate->cmd_svar; + return parse_call_command( + static_cast(hid), id, ltype + ); + } + default: + /* unreachable */ + break; + } + return true; +} + +/* generates a call to an unknown entity on the stack */ +static bool parse_no_id(parser_state &ps, int term) { + std::uint32_t nargs = 0; + /* the entity is already on the stack, parse out any arguments to it */ + while (ps.parse_arg(VAL_ANY)) { + ++nargs; + } + ps.gs.gen_call(nargs); + return finish_statement(ps, false, term); +} + +bool parser_state::parse_assign( + charbuf &idname, int ltype, int term, bool &noass +) { + /* lookahead */ + switch (current(1)) { + /* the = can be followed by a bunch of stuff + * some of these result in empty assignments + */ + case '/': /* a comment maybe? */ + if (current(2) != '/') { + /* not a comment */ + noass = true; + return true; + } + [[fallthrough]]; + case ' ': + case '\t': + case '\r': + case '\n': + case '\0': { + /* skip = */ + next_char(); + /* we had a name on the left hand side */ + if (!idname.empty()) { + idname.push_back('\0'); + /* fetch an ident or make up a fresh one (unknown alias) */ + ident &id = ts.istate->new_ident( + *ts.pstate, idname.str_term(), IDENT_FLAG_UNKNOWN + ); + /* check what we're assigning */ + switch (id.get_type()) { + case ident_type::ALIAS: { + /* alias assignment: parse out any one argument */ + bool more = parse_arg(VAL_ANY); + if (!more) { + gs.gen_val_string(); + } + gs.gen_assign_alias(id); + return finish_statement(*this, more, term); } - [[fallthrough]]; - case ';': - case ' ': - case '\t': - case '\r': - case '\n': - case '\0': - next_char(); - if (!idname.empty()) { - idname.push_back('\0'); - ident &id = ts.istate->new_ident( - *ts.pstate, idname.str_term(), IDENT_FLAG_UNKNOWN + case ident_type::IVAR: { + auto *hid = ts.istate->cmd_ivar; + bool more = parse_call_command( + static_cast(hid), + id, ltype, 1 ); - switch (id.get_type()) { - case ident_type::ALIAS: - more = parse_arg(VAL_ANY); - if (!more) { - gs.gen_val_string(); - } - gs.gen_assign_alias(id); - goto endstatement; - case ident_type::IVAR: { - auto *hid = ts.istate->cmd_ivar; - more = parse_call_command( - static_cast(hid), - id, rettype, 1 - ); - goto endstatement; - } - case ident_type::FVAR: { - auto *hid = ts.istate->cmd_fvar; - more = parse_call_command( - static_cast(hid), - id, rettype, 1 - ); - goto endstatement; - } - case ident_type::SVAR: { - auto *hid = ts.istate->cmd_svar; - more = parse_call_command( - static_cast(hid), - id, rettype, 1 - ); - goto endstatement; - } - default: - break; - } - gs.gen_val_string(idname.str_term()); + return finish_statement(*this, more, term); } - more = parse_arg(VAL_ANY); - if (!more) { - gs.gen_val_string(); + case ident_type::FVAR: { + auto *hid = ts.istate->cmd_fvar; + bool more = parse_call_command( + static_cast(hid), + id, ltype, 1 + ); + return finish_statement(*this, more, term); } - gs.gen_assign(); - goto endstatement; - } - } - if (idname.empty()) { -noid: - std::uint32_t numargs = 0; - for (;;) { - more = parse_arg(VAL_ANY); - if (!more) { - break; - } - ++numargs; - } - gs.gen_call(numargs); - } else { - idname.push_back('\0'); - ident *id = ts.pstate->get_ident(idname.str_term()); - if (!id) { - if (is_valid_name(idname.str_term())) { - gs.gen_val_string(idname.str_term()); - goto noid; - } - switch (rettype) { - case VAL_ANY: { - std::string_view end = idname.str_term(); - integer_type val = parse_int(end, &end); - if (!end.empty()) { - gs.gen_val_string(idname.str_term()); - } else { - gs.gen_val_integer(val); - } - break; + case ident_type::SVAR: { + auto *hid = ts.istate->cmd_svar; + bool more = parse_call_command( + static_cast(hid), + id, ltype, 1 + ); + return finish_statement(*this, more, term); } default: - gs.gen_val(rettype, idname.str_term(), int(curline)); break; } - gs.gen_result(); - } else { - switch (ident_p{*id}.impl().p_type) { - case ID_ALIAS: - more = parse_call_alias(static_cast(*id)); - break; - case ID_COMMAND: - more = parse_call_command( - static_cast(id), *id, rettype - ); - break; - case ID_LOCAL: - more = parse_id_local(); - break; - case ID_DO: - more = parse_id_do(false, rettype); - break; - case ID_DOARGS: - more = parse_id_do(true, rettype); - break; - case ID_IF: - more = parse_id_if(*id, rettype); - break; - case ID_BREAK: - gs.gen_break(); - break; - case ID_CONTINUE: - gs.gen_continue(); - break; - case ID_RESULT: - if (more) { - more = parse_arg(VAL_ANY); - } - if (!more) { - gs.gen_result_null(rettype); - } else { - gs.gen_result(rettype); - } - break; - case ID_NOT: - if (more) { - more = parse_arg(VAL_ANY); - } - if (!more) { - gs.gen_result_true(rettype); - } else { - gs.gen_not(rettype); - } - break; - case ID_AND: - case ID_OR: - more = parse_id_and_or(*id, rettype); - break; - case ID_IVAR: { - auto *hid = ts.istate->cmd_ivar; - more = parse_call_command( - static_cast(hid), *id, rettype - ); - break; - } - case ID_FVAR: { - auto *hid = ts.istate->cmd_fvar; - more = parse_call_command( - static_cast(hid), *id, rettype - ); - break; - } - case ID_SVAR: { - auto *hid = ts.istate->cmd_svar; - more = parse_call_command( - static_cast(hid), *id, rettype - ); - break; - } - } + gs.gen_val_string(idname.str_term()); + } + /* unknown thing, make it the VM's problem */ + bool more = parse_arg(VAL_ANY); + if (!more) { + gs.gen_val_string(); + } + gs.gen_assign(); + return finish_statement(*this, more, term); + } + /* not followed by any of these: not an assignment */ + default: + noass = true; + return true; + } + return true; +} + +void parser_state::parse_block(int ltype, int term) { + charbuf idname{gs.ts}; + /* the main statement parse loop */ + for (;;) { + /* first, skip any comments in the way and prepare the env */ + skip_comments(); + idname.clear(); + std::size_t curline = current_line; + bool more = true; + /* parse the left hand side of the statement */ + if (!parse_arg(VAL_WORD, &idname)) { + if (!finish_statement(*this, more, term)) { + return; + } + continue; + } + skip_comments(); + /* potentially an assignment */ + if (current() == '=') { + bool noass = false; + if (!parse_assign(idname, ltype, term, noass)) { + /* terminated */ + return; + } + if (!noass) { + /* was actually an assignment */ + continue; } } -endstatement: - if (more) { - while (parse_arg(VAL_POP)); + /* we didn't get a name to look up: treat as unknown */ + if (idname.empty()) { + if (!parse_no_id(*this, term)) { + return; + } + continue; } - switch (skip_until(")];/\n")) { - case '\0': - if (current() != brak) { - throw error{*ts.pstate, "missing \"%c\"", char(brak)}; + idname.push_back('\0'); + auto idstr = idname.str_term(); + ident *id = ts.pstate->get_ident(idstr); + if (!id) { + /* no such ident exists but the name is valid, which means + * it's a syntactically ok call, make it the VM's problem + */ + if (is_valid_name(idstr)) { + /* VAL_WORD does not codegen, put the name on the stack */ + gs.gen_val_string(idstr); + if (!parse_no_id(*this, term)) { return; } - return; - case ')': - case ']': - if (current() == brak) { - next_char(); - return; + continue; + } + /* not a valid command name: treat like an expression */ + switch (ltype) { + case VAL_ANY: { + auto end = idstr; + auto val = parse_int(idstr, &end); + if (!end.empty()) { + gs.gen_val_string(idstr); + } else { + gs.gen_val_integer(val); + } + break; } - throw error{*ts.pstate, "unexpected \"%c\"", current()}; - return; - case '/': - next_char(); - if (current() == '/') { - skip_until('\n'); - } - goto endstatement; - default: - next_char(); - break; + default: + gs.gen_val(ltype, idname.str_term(), int(curline)); + break; + } + gs.gen_result(); + continue; + } + /* the ident exists; treat like a call according to its type */ + more = parse_call_id(*id, ltype); + if (!finish_statement(*this, more, term)) { + return; } } } diff --git a/src/cs_parser.hh b/src/cs_parser.hh index 61997e0..8e7b10c 100644 --- a/src/cs_parser.hh +++ b/src/cs_parser.hh @@ -42,7 +42,7 @@ struct parser_state { std::string_view get_word(); - void parse_block(int ret_type, int term = '\0'); + void parse_block(int ltype, int term = '\0'); void next_char() { if (source == send) { @@ -77,6 +77,9 @@ struct parser_state { command_impl *id, ident &self, int rettype, std::uint32_t limit = 0 ); bool parse_call_alias(alias &id); + bool parse_call_id(ident &id, int ltype); + + bool parse_assign(charbuf &idname, int ltype, int term, bool &noass); bool parse_id_local(); bool parse_id_do(bool args, int ltype);