Add more characters that should be omitted from STARTCHAR

and add an explanation for why this is the case.
This commit is contained in:
Jos van den Oever 2024-09-02 12:49:56 +02:00
parent ecc5a1aa00
commit c0b60b1c4f
2 changed files with 32 additions and 10 deletions

View file

@ -1,7 +1,7 @@
takentaal v0.1.0 takentaal v0.1.0
# {5000} Full work plan # {5000} Full work plan
This is the description of the entire work plan. This is the (draft) description of the entire work plan.
## {1000} First task ## {1000} First task
@ -12,9 +12,9 @@ This description has # ' " [] symbols and ü © Ð Latin-1 chars.
- {500} First subtask - {500} First subtask
/ {500} Second subtask / {500} Second subtask
## Second task {1000} ## {1000} Second task
This is the description of the second task. This is the description of the second task.
* {500} First subtask * {500} First subtask
- {500} Second subtask - {500} \2nd subtask

View file

@ -1,3 +1,17 @@
/**
* This file defines the grammar for takentaal.
* It is divided into parser rules (lowercase) and lexer rules (uppercase).
* The parser splits an input into tokens according to the lexer rules.
* At any point, all lexer rules are considered. If multiple rules match,
* a lexer rule is chosen as follows:
* - the rule that matches the longest input is chosen
* - any implicit rule, e.g. 'a', is chosen
* - the first defined rule is chosen.
* Since this grammar has to match unquoted texts and texts are usually longer
* than other token matches, the TEXT rule disallows many characters as the
* first character of a text.
*/
grammar takentaal; grammar takentaal;
takentaal takentaal
@ -73,14 +87,14 @@ WS
; ;
EOL EOL
: '\n'+ : ' '* '\n'+
; ;
INT INT
: DIGIT+ : DIGIT+
; ;
DIGIT fragment DIGIT
: [0-9] : [0-9]
; ;
@ -92,14 +106,22 @@ END_AMOUNT
: '}' : '}'
; ;
STARTCHAR // all special characters, including ' ' and digits are subtracted from the printable character range
: [!-"$-/:-\u007A\u007C\u007E] // '!' '#' '-' '/' '*'
fragment STARTCHAR
: ["$-)+-,.:-z|~\u00A0-\u33FF]
; ;
CHAR // A text should not end with a space, so the ENDCHAR omits the space
: [ -\u007E\u00A0-\u33FF] // ASCII and UNICODE fragment ENDCHAR
: ["-~\u00A0-\u33FF]
; ;
fragment CHAR
: [ -~\u00A0-\u33FF] // ASCII and UNICODE
;
// A text cannot start with a special character or has to be placed in quotes
TEXT TEXT
: STARTCHAR CHAR* : STARTCHAR (CHAR* ENDCHAR)?
; ;