/**
 * This file defines the grammar for takentaal.
 * It is divided into parser rules (lowercase) and lexer rules (uppercase).
 * The lexer splits the input into tokens according to the lexer rules.
 * At any point, all lexer rules are considered. If multiple rules match,
 * a lexer rule is chosen as follows:
 *  - the rule that matches the longest input is chosen
 *  - on a tie, any implicit rule (a literal used in a parser rule, e.g. 'a')
 *    is chosen
 *  - otherwise the first defined rule is chosen.
 * Since this grammar has to match unquoted texts and texts are usually longer
 * than other token matches, texts are split into characters so that they have
 * a lower ranking.
 */
grammar takentaal;

// Entry point: exactly one document, either a plan or an amendment, then EOF.
takentaal
    : (takentaal_v1_0 | amendment_v1_0) EOF
    ;

// ---------------------------------------------------------------------------
// takentaal v1.0: a plan with one or more tasks, each with optional subtasks.
// ---------------------------------------------------------------------------

takentaal_v1_0
    : 'takentaal v1.0' t1_0_plan EOL*
    ;

t1_0_plan
    : PLAN_TOKEN S* amount text description t1_0_task+
    ;

t1_0_task
    : TASK_TOKEN S* amount text description t1_0_subtask*
    ;

// In a plain plan only the "new" subtask marker ('-') is allowed.
t1_0_subtask
    : SUBTASK_NEW_TOKEN S* amount text description
    ;

// ---------------------------------------------------------------------------
// takentaal-amendment v1.0: same layout as a plan, but subtasks may carry any
// of the four subtask markers.
// ---------------------------------------------------------------------------

amendment_v1_0
    : 'takentaal-amendment v1.0' a1_0_plan EOL*
    ;

a1_0_plan
    : PLAN_TOKEN S* amount text description a1_0_task+
    ;

a1_0_task
    : TASK_TOKEN S* amount text description a1_0_subtask*
    ;

a1_0_subtask
    : a1_0_subtask_token S* amount text description
    ;

a1_0_subtask_token
    : SUBTASK_NEW_TOKEN
    | SUBTASK_PARTIAL_TOKEN
    | SUBTASK_COMPLETE_TOKEN
    | SUBTASK_OBSOLETE_TOKEN
    ;

// ---------------------------------------------------------------------------
// Shared parser rules
// ---------------------------------------------------------------------------

// Any implicit and explicit lexer token that may appear in a text should be
// listed in this definition.
// The two version-header literals are implicit tokens as well: the lexer emits
// them wherever the exact phrase occurs (they are longer than the WORD match),
// so they must be accepted here or a title/description that mentions
// "takentaal v1.0" or "takentaal-amendment v1.0" would be a syntax error.
text
    : ( INT
      | '{'
      | '}'
      | 'takentaal v1.0'
      | 'takentaal-amendment v1.0'
      | S
      | CHAR
      | WORD
      )+
    ;

// Zero or more continuation lines, each introduced by an EOL token.
description
    : (EOL text)*
    ;

// An amount in curly brackets or nothing
amount
    : '{' S* INT S* '}'
    |
    ;

// ---------------------------------------------------------------------------
// Lexer rules
// ---------------------------------------------------------------------------

// The structural marker tokens below are combined with EOL so that they only
// match at the start of a line.
PLAN_TOKEN
    : EOL+ '#'
    ;

TASK_TOKEN
    : EOL+ '##'
    ;

SUBTASK_NEW_TOKEN
    : EOL+ '-'
    ;

SUBTASK_PARTIAL_TOKEN
    : EOL+ '/'
    ;

SUBTASK_COMPLETE_TOKEN
    : EOL+ '*'
    ;

SUBTASK_OBSOLETE_TOKEN
    : EOL+ '!'
    ;

// A single space. Spaces are significant and are passed to the parser.
S
    : ' '
    ;

// No implicit whitespace handling
// NOTE(review): as written this set contains only a space, which S (defined
// earlier) already matches with the same length, so this rule never wins.
// It is kept so the token vocabulary stays unchanged.
WS
    : [ ] -> skip
    ;

// Optional trailing spaces followed by one or more line feeds.
// NOTE(review): no lexer rule in this grammar matches '\t' or '\r'.
EOL
    : ' '* '\n'+
    ;

INT
    : DIGIT+
    ;

fragment DIGIT
    : [0-9]
    ;

// Match printable characters, except space which is covered by S
CHAR
    : [!-~\u00A0-\u33FF] // ASCII and UNICODE
    ;

// This is a performance improvement that groups chars that do not have a
// special meaning
WORD
    : [A-Za-z\u00A0-\u33FF]+
    ;