takentaal/takentaal.g4

/**
 * This file defines the grammar for takentaal.
 * It is divided into parser rules (lowercase) and lexer rules (uppercase).
 * The lexer splits the input into tokens according to the lexer rules.
 * At any point, all lexer rules are considered. If multiple rules match,
 * a lexer rule is chosen as follows:
 *  - the rule that matches the longest input is chosen
 *  - any implicit rule, e.g. 'a', is chosen
 *  - the first defined rule is chosen.
 * Since this grammar has to match unquoted texts and texts are usually longer
 * than other token matches, texts are split into characters so that they have
 * a lower ranking.
 */

grammar takentaal;

/**
 * Entry rule: a document is the version header followed by exactly one plan.
 *
 * The rule is anchored with EOF so that the parser has to account for the
 * whole input. Without the anchor, parsing stops silently at the first token
 * that fits no rule and the remainder of the file is ignored.
 * Trailing line breaks are consumed explicitly, because an EOL that is not
 * followed by text does not belong to any description.
 */
takentaal
    : header
      plan
      EOL*
      EOF
    ;

// Version line of the document. The literal becomes an implicit token,
// so it has to appear exactly as written (single space, no variations).
header : 'takentaal v0.1.0' ;

/**
 * Free, unquoted text on a single line.
 *
 * Besides spaces, single characters and words, text also has to accept the
 * tokens that the lexer prefers over CHAR for a one-character match because
 * they are defined earlier in this file:
 *  - INT, so that digits can be used ("Phase 2");
 *  - the subtask state markers '-', '/', '*' and '!', so that they can be
 *    used in the middle of a line ("front-end", "and/or").
 * START_AMOUNT and END_AMOUNT ('{' and '}') are deliberately not accepted,
 * which keeps an amount in front of a text unambiguous.
 */
text
    : ( S
      | CHAR
      | WORD
      | INT
      | SUBTASK_NEW_TOKEN
      | SUBTASK_PARTIAL_TOKEN
      | SUBTASK_COMPLETE_TOKEN
      | SUBTASK_OBSOLETE_TOKEN
      )+
    ;

// The plan: a '#' heading line (marker, optional spaces, optional amount,
// title text), then its description lines, then one or more tasks.
plan : PLAN_TOKEN S* amount text description task+ ;

// Zero or more description lines; every line is a line break followed by text.
description : ( EOL text )* ;

// A task: a '##' heading line (marker, optional spaces, optional amount,
// title text), then its description lines, then zero or more subtasks.
task : TASK_TOKEN S* amount text description subtask* ;

// A subtask: a state-marker line (marker, optional spaces, optional amount,
// title text) followed by its description lines.
subtask : SUBTASK_TOKEN S* amount text description ;

// Optional amount in braces: '{', optional spaces, an integer, '}'.
// The rule may match nothing at all, so an `amount` node is always present
// in the parse tree, possibly without children.
amount
    : ( START_AMOUNT S* INT END_AMOUNT )?
    ;

// Plan heading marker: one or more line breaks directly followed by '#'.
// Because the line break is part of the token, '#' only acts as a marker
// when it is the first character of a line.
PLAN_TOKEN : EOL+ '#' ;

// Task heading marker: one or more line breaks directly followed by '##'.
// It is one character longer than PLAN_TOKEN on the same input, so the
// longest-match rule picks TASK_TOKEN for a line starting with '##'.
TASK_TOKEN : EOL+ '##' ;

// Subtask marker: one or more line breaks directly followed by one of the
// four subtask state characters.
SUBTASK_TOKEN
    : EOL+
      ( SUBTASK_NEW_TOKEN
      | SUBTASK_PARTIAL_TOKEN
      | SUBTASK_COMPLETE_TOKEN
      | SUBTASK_OBSOLETE_TOKEN
      )
    ;

// Subtask state characters, one rule per state.
// These are regular lexer rules, not fragments: a state character that does
// not directly follow a line break is emitted as a token of its own, and it
// wins over CHAR for that character because it is defined earlier.
SUBTASK_NEW_TOKEN      : '-' ;
SUBTASK_PARTIAL_TOKEN  : '/' ;
SUBTASK_COMPLETE_TOKEN : '*' ;
SUBTASK_OBSOLETE_TOKEN : '!' ;

// Exactly one space character. Spaces are significant: they are part of text.
S : ' ' ;


// Skips a single space.
// NOTE(review): S is defined earlier and matches exactly the same input
// (one space), so on a tie S is chosen and this rule never produces a match.
WS : [ ] -> skip ;

// A line break: optional trailing spaces followed by one or more newlines.
// Each newline may be preceded by a carriage return, so that files with
// Windows (CRLF) line endings tokenize the same way as files with Unix (LF)
// line endings instead of producing token recognition errors on '\r'.
EOL
    : ' '* ('\r'? '\n')+
    ;

// An unsigned integer: one or more decimal digits.
INT : DIGIT+ ;

// A single decimal digit; a fragment, so it never becomes a token itself.
fragment DIGIT : [0-9] ;

// Delimiters that enclose an amount.
START_AMOUNT : '{' ;
END_AMOUNT   : '}' ;

// A single printable character: ASCII '!' through '~', or a Unicode
// character from U+00A0 through U+33FF. The space is not included here,
// it is covered by S.
CHAR : [!-~\u00A0-\u33FF] ;

// Performance improvement: a run of characters without special meaning
// (ASCII letters and U+00A0 through U+33FF) is grouped into one token.
// A run of length one ties with CHAR, which is defined earlier and wins.
WORD : [A-Za-z\u00A0-\u33FF]+ ;