takentaal/takentaal.g4

/**
 * This file defines the grammar for takentaal.
 * It is divided into parser rules (lowercase names) and lexer rules (uppercase
 * names).
 * The lexer splits the input into tokens according to the lexer rules.
 * At any point in the input text, all lexer rules are considered. If multiple
 * rules match, a lexer rule is chosen as follows:
 *  - the rule that matches the longest input is chosen
 *  - any implicit rule, e.g. 'a', is chosen
 *  - the first defined rule is chosen.
 * Since this grammar has to match unquoted texts and texts are usually longer
 * than other token matches, texts are split into characters so that they have
 * a lower ranking.
 */

grammar takentaal;

// Entry rule: the input is either a plan document (takentaal_v1_0) or an
// amendment document (amendment_v1_0), and must be consumed up to EOF.
takentaal : (takentaal_v1_0 | amendment_v1_0) EOF ;

// A v1.0 plan document: the literal version header, exactly one plan, and
// any number of trailing EOL tokens.
takentaal_v1_0
    : 'takentaal v1.0'
      t1_0_plan
      EOL*
    ;

// A plan: the '# ' heading marker (PLAN_TOKEN), optional spaces, an optional
// amount, the title text, optional description lines, and one or more tasks.
t1_0_plan
    : PLAN_TOKEN S* amount text
      description
      t1_0_task+
    ;

// A task: the '## ' heading marker (TASK_TOKEN), optional spaces, an optional
// amount, the title text, optional description lines, and zero or more
// subtasks.
t1_0_task
    : TASK_TOKEN S* amount text
      description
      t1_0_subtask*
    ;

// A subtask in a plan document. Only the '- ' marker (SUBTASK_NEW_TOKEN) is
// accepted here; the other subtask markers are reserved for amendments.
t1_0_subtask
    : SUBTASK_NEW_TOKEN S* amount text
      description
    ;

// A v1.0 amendment document: the literal amendment header, exactly one plan,
// and any number of trailing EOL tokens.
amendment_v1_0
    : 'takentaal-amendment v1.0'
      a1_0_plan
      EOL*
    ;

// The plan inside an amendment. Same shape as t1_0_plan, but its tasks are
// a1_0_task so that their subtasks may carry any of the amendment markers.
a1_0_plan
    : PLAN_TOKEN S* amount text
      description
      a1_0_task+
    ;

// A task inside an amendment: '## ' marker, optional spaces, optional amount,
// title text, optional description lines, and zero or more amendment
// subtasks.
a1_0_task
    : TASK_TOKEN S* amount text
      description
      a1_0_subtask*
    ;

// A subtask inside an amendment: one of the markers from a1_0_subtask_token,
// optional spaces, an optional amount, the title text, and optional
// description lines.
a1_0_subtask
    : a1_0_subtask_token S* amount text
      description
    ;

// The line-start marker of an amendment subtask. The marker characters are
// defined by the lexer rules: '- ' (new), '/ ' (partial), '* ' (complete)
// and '! ' (obsolete).
a1_0_subtask_token
    : SUBTASK_NEW_TOKEN
    | SUBTASK_PARTIAL_TOKEN
    | SUBTASK_COMPLETE_TOKEN
    | SUBTASK_OBSOLETE_TOKEN
    ;

// Any implicit and explicit lexer token that may appear in a text must be
// listed in this definition; a token that is missing here turns every line
// containing it into a syntax error.
// The implicit tokens '} ' (used by the amount rule) and the two version
// header literals are listed because the lexer always prefers the longest
// match: "} " inside a text is tokenized as '} ', never as '}' followed by S,
// and a literal "takentaal v1.0" is never split into WORD/S/CHAR/INT tokens.
text
    : ( INT | '{' | '}' | '} ' | S | CHAR | WORD
      | 'takentaal v1.0' | 'takentaal-amendment v1.0'
      )+
    ;

// Zero or more description lines, each introduced by an EOL token and
// followed by a non-empty text. A line that starts with one of the heading or
// subtask markers is lexed as that (longer) marker token instead of EOL, so
// it ends the description.
description : (EOL text)* ;

// An amount in curly brackets, or nothing.
// The closing literal is '} ' (brace plus one space), so an amount is only
// recognized when the closing brace is directly followed by a space. Spaces
// are allowed between the braces and the integer.
amount : '{' S* INT S* '} ' | ;

// Lexer rules

// Plan heading marker: '# ' at the start of a line. The preceding line
// break(s) are part of the token so that it can only match at a line start.
PLAN_TOKEN : EOL+ '# ' ;

// Task heading marker: '## ' at the start of a line, including the preceding
// line break(s).
TASK_TOKEN : EOL+ '## ' ;

// Subtask markers. These lexer tokens are combined with EOL so that they only
// match at the start of a line; the same characters in the middle of a line
// are lexed as CHAR followed by S.
// Each marker is a single character followed by exactly one space.
SUBTASK_NEW_TOKEN      : EOL+ '- ' ;
SUBTASK_PARTIAL_TOKEN  : EOL+ '/ ' ;
SUBTASK_COMPLETE_TOKEN : EOL+ '* ' ;
SUBTASK_OBSOLETE_TOKEN : EOL+ '! ' ;

// A single space. Spaces are significant in this grammar and are passed to
// the parser as individual tokens.
S : ' ' ;

// No implicit whitespace handling: spaces reach the parser as S tokens.
// NOTE(review): this rule appears to be unreachable. A single space is also
// matched by S, which is defined earlier and therefore wins the tie, and a
// run of spaces that ends in a newline is matched by the longer EOL. Confirm
// whether WS can be removed.
WS : [ ] -> skip ;

// End of line: optional trailing spaces followed by one or more newlines, so
// trailing spaces and blank lines are folded into a single token.
// NOTE(review): no lexer rule in this grammar matches '\r', so CRLF input
// would presumably produce token recognition errors - confirm that input is
// normalized to '\n' before parsing.
EOL : ' '* '\n'+ ;

// An unsigned integer: one or more decimal digits. Defined before CHAR so
// that a single digit is lexed as INT rather than CHAR.
INT : DIGIT+ ;

// A single ASCII decimal digit; a fragment, so it never becomes a token on
// its own.
fragment DIGIT : [0-9] ;

// Match a single printable character, except space which is covered by S.
// Characters that also start a longer or earlier-defined token (digits,
// braces, line-start markers) are claimed by those rules first.
CHAR : [!-~\u00A0-\u33FF] ; // printable ASCII '!'..'~' plus U+00A0..U+33FF

// This is a performance improvement that groups runs of characters that do
// not have a special meaning (ASCII letters and U+00A0..U+33FF) into a single
// token instead of one CHAR token per character.
// A run of length one ties with CHAR, which is defined earlier, so it is
// emitted as CHAR; both are accepted by the text rule.
WORD : [A-Za-z\u00A0-\u33FF]+ ;