Add more characters that should be omitted from STARTCHAR

and add an explanation for why this is the case.
2025-10-27 09:59:22 +00:00 · 2024-09-02 12:49:56 +02:00 · 2024-09-02 12:49:56 +02:00 · c0b60b1c4f
commit c0b60b1c4f
parent ecc5a1aa00
2 changed files with 32 additions and 10 deletions
--- a/6
+++ b/6
@ -1,7 +1,7 @@
 takentaal v0.1.0
 #  {5000} Full work plan
-This is the description of the entire work plan.
+This is the (draft) description of the entire work plan.
 ## {1000} First task
@ -12,9 +12,9 @@ This description has # ' " [] symbols and ü © Ð Latin-1 chars.
 -   {500} First subtask
 /   {500} Second subtask
-## Second task {1000}
+## {1000} Second task
 This is the description of the second task.
 *   {500} First subtask
- {500} Second subtask
+- {500} \2nd subtask
--- a/takentaal.g4
+++ b/takentaal.g4
@ -1,3 +1,17 @@
 /**
 * This file defines the grammar for takentaal.
 * It is divided into parser rules (lowercase) and lexer rules (uppercase).
 * The parser splits an input into tokens according to the lexer rules.
 * At any point, all lexer rules are considered. If multiple rules match,
 * a lexer rule is chosen as follows:
 *  - the rule that matches the longest input is chosen
 *  - any implicit rule, e.g. 'a', is chosen
 *  - the first defined rule is chosen.
 * Since this grammar has to match unquoted texts and texts are usually longer
 * than other token matches, the TEXT rule disallows many characters as the
 * first character to start with.
 */
 grammar takentaal;
 takentaal
@ -73,14 +87,14 @@ WS
    ;
 EOL
-    : '\n'+
+    : ' '* '\n'+
    ;
 INT
    : DIGIT+
    ;
-DIGIT
+fragment DIGIT
    : [0-9]
    ;
@ -92,14 +106,22 @@ END_AMOUNT
    : '}'
    ;
-STARTCHAR
+// All special characters, including ' ' and digits, are subtracted from the printable character range
-    : [!-"$-/:-\u007A\u007C\u007E]
+// '!' '#' '-' '/' '*'
 fragment STARTCHAR
    : ["$-)+-,.:-z|~\u00A0-\u33FF]
    ;
-CHAR
+// A text should not end with a space, so ENDCHAR omits the space
-    : [ -\u007E\u00A0-\u33FF] // ASCII and UNICODE
+fragment ENDCHAR
    : ["-~\u00A0-\u33FF]
    ;
 fragment CHAR
    : [ -~\u00A0-\u33FF] // ASCII and UNICODE
    ;
 // A text cannot start with a special character unless it is placed in quotes
 TEXT
-    : STARTCHAR CHAR*
+    : STARTCHAR (CHAR* ENDCHAR)?
    ;