NOTE(review): this patch was restored from a whitespace-collapsed copy.
Line breaks follow the line counts in each hunk header. The leading
whitespace inside the hunks (a tab, or a tab plus two spaces on
continuation lines) is an assumption - confirm it against takentaal.g4,
or try `git apply --ignore-whitespace` if the hunks do not match.

diff --git a/takentaal.g4 b/takentaal.g4
index f8c8d83..9c42dde 100644
--- a/takentaal.g4
+++ b/takentaal.g4
@@ -8,8 +8,8 @@
  * - any implicit rule, e.g. 'a', is chosen
  * - the first defined rule is chosen.
  * Since this grammar has to match unquoted texts and text are usually longer
- * than other token matches, the TEXT rule disallows many characters as the
- * first character to start with.
+ * than other token matches, texts are split into characters so that they have
+ * a lower ranking.
  */
 
 grammar takentaal;
@@ -20,27 +20,31 @@ takentaal
 	;
 
 header
-	: 'takentaal v0.1.0' EOL
+	: 'takentaal v0.1.0'
+	;
+
+text
+	: (S | CHAR | WORD)+
 	;
 
 plan
-	: PLAN_TOKEN S* amount TEXT EOL
+	: PLAN_TOKEN S* amount text
 	  description
 	  task+
 	;
 
 description
-	: (TEXT EOL)*
+	: (EOL text)*
 	;
 
 task
-	: TASK_TOKEN S* amount TEXT EOL
+	: TASK_TOKEN S* amount text
 	  description
 	  subtask*
 	;
 
 subtask
-	: SUBTASK_TOKEN S* amount TEXT EOL
+	: SUBTASK_TOKEN S* amount text
 	  description
 	;
 
@@ -50,15 +54,15 @@ amount
 	;
 
 PLAN_TOKEN
-	: '#'
+	: EOL+ '#'
 	;
 
 TASK_TOKEN
-	: '##'
+	: EOL+ '##'
 	;
 
 SUBTASK_TOKEN
-	: (SUBTASK_NEW_TOKEN | SUBTASK_PARTIAL_TOKEN | SUBTASK_COMPLETE_TOKEN | SUBTASK_OBSOLETE_TOKEN)
+	: EOL+ (SUBTASK_NEW_TOKEN | SUBTASK_PARTIAL_TOKEN | SUBTASK_COMPLETE_TOKEN | SUBTASK_OBSOLETE_TOKEN)
 	;
 
 SUBTASK_NEW_TOKEN
@@ -78,7 +82,7 @@ SUBTASK_OBSOLETE_TOKEN
 	;
 
 S
-	: ' ' -> skip
+	: ' '
 	;
 
 
@@ -106,22 +110,12 @@ END_AMOUNT
 	: '}'
 	;
 
-// all special characters, including ' ' and digits are subtracted from the printable character range
-// '!' '#' '-' '/' '*'
-fragment STARTCHAR
-	: ["$-)+-,.:-z|~\u00A0-\u33FF]
+// Match printable characters, except space which is covered by S
+CHAR
+	: [!-~\u00A0-\u33FF] // ASCII and UNICODE
 	;
 
-// A text should not end with a space, so the ENDHAR omits the space
-fragment ENDCHAR
-	: ["-~\u00A0-\u33FF]
-	;
-
-fragment CHAR
-	: [ -~\u00A0-\u33FF] // ASCII and UNICODE
-	;
-
-// A text cannot start with a special character or has to be placed in quotes
-TEXT
-	: STARTCHAR (CHAR* ENDCHAR)?
+// This is a performance improvement that groups chars that do not have a special meaning
+WORD
+	: [A-Za-z\u00A0-\u33FF]+
 	;