Split description text into character tokens so that text has a lower ranking

This commit is contained in:
Jos van den Oever 2024-09-02 14:00:51 +02:00
parent c0b60b1c4f
commit f9e92b8152

View file

@ -8,8 +8,8 @@
* - any implicit rule, e.g. 'a', is chosen * - any implicit rule, e.g. 'a', is chosen
* - the first defined rule is chosen. * - the first defined rule is chosen.
* Since this grammar has to match unquoted texts and texts are usually longer * Since this grammar has to match unquoted texts and texts are usually longer
* than other token matches, the TEXT rule disallows many characters as the * than other token matches, texts are split into characters so that they have
* first character to start with. * a lower ranking.
*/ */
grammar takentaal; grammar takentaal;
@ -20,27 +20,31 @@ takentaal
; ;
header header
: 'takentaal v0.1.0' EOL : 'takentaal v0.1.0'
;
text
: (S | CHAR | WORD)+
; ;
plan plan
: PLAN_TOKEN S* amount TEXT EOL : PLAN_TOKEN S* amount text
description description
task+ task+
; ;
description description
: (TEXT EOL)* : (EOL text)*
; ;
task task
: TASK_TOKEN S* amount TEXT EOL : TASK_TOKEN S* amount text
description description
subtask* subtask*
; ;
subtask subtask
: SUBTASK_TOKEN S* amount TEXT EOL : SUBTASK_TOKEN S* amount text
description description
; ;
@ -50,15 +54,15 @@ amount
; ;
PLAN_TOKEN PLAN_TOKEN
: '#' : EOL+ '#'
; ;
TASK_TOKEN TASK_TOKEN
: '##' : EOL+ '##'
; ;
SUBTASK_TOKEN SUBTASK_TOKEN
: (SUBTASK_NEW_TOKEN | SUBTASK_PARTIAL_TOKEN | SUBTASK_COMPLETE_TOKEN | SUBTASK_OBSOLETE_TOKEN) : EOL+ (SUBTASK_NEW_TOKEN | SUBTASK_PARTIAL_TOKEN | SUBTASK_COMPLETE_TOKEN | SUBTASK_OBSOLETE_TOKEN)
; ;
SUBTASK_NEW_TOKEN SUBTASK_NEW_TOKEN
@ -78,7 +82,7 @@ SUBTASK_OBSOLETE_TOKEN
; ;
S S
: ' ' -> skip : ' '
; ;
@ -106,22 +110,12 @@ END_AMOUNT
: '}' : '}'
; ;
// all special characters, including ' ' and digits are subtracted from the printable character range // Match printable characters, except space which is covered by S
// '!' '#' '-' '/' '*' CHAR
fragment STARTCHAR : [!-~\u00A0-\u33FF] // ASCII and UNICODE
: ["$-)+-,.:-z|~\u00A0-\u33FF]
; ;
// A text should not end with a space, so the ENDCHAR omits the space // This is a performance improvement that groups chars that do not have a special meaning
fragment ENDCHAR WORD
: ["-~\u00A0-\u33FF] : [A-Za-z\u00A0-\u33FF]+
;
fragment CHAR
: [ -~\u00A0-\u33FF] // ASCII and UNICODE
;
// A text must not start with a special character; otherwise it has to be placed in quotes
TEXT
: STARTCHAR (CHAR* ENDCHAR)?
; ;