Split description text into character tokens so that text has a lower ranking

2025-10-27 09:59:22 +00:00 · 2024-09-02 14:00:51 +02:00 · 2024-09-02 14:00:51 +02:00 · f9e92b8152
commit f9e92b8152
parent c0b60b1c4f
1 changed file with 21 additions and 27 deletions
--- a/takentaal.g4
+++ b/takentaal.g4
@@ -8,8 +8,8 @@
 *  - any implicit rule, e.g. 'a', is chosen
 *  - the first defined rule is chosen.
 * Since this grammar has to match unquoted texts and text are usually longer
- * than other token matches, the TEXT rule disallows many characters as the
+ * than other token matches, texts are split into characters so that they have
- * first character to start with.
+ * a lower ranking.
 */
 grammar takentaal;
@@ -20,27 +20,31 @@ takentaal
    ;
 header
-    : 'takentaal v0.1.0' EOL
+    : 'takentaal v0.1.0'
    ;
 text
    : (S | CHAR | WORD)+
    ;
 plan
-    : PLAN_TOKEN S* amount TEXT EOL
+    : PLAN_TOKEN S* amount text
      description
      task+
    ;
 description
-    : (TEXT EOL)*
+    : (EOL text)*
    ;
 task
-    : TASK_TOKEN S* amount TEXT EOL
+    : TASK_TOKEN S* amount text
      description
      subtask*
    ;
 subtask
-    : SUBTASK_TOKEN S* amount TEXT EOL
+    : SUBTASK_TOKEN S* amount text
      description
    ;
@@ -50,15 +54,15 @@ amount
    ;
 PLAN_TOKEN
-    : '#'
+    : EOL+ '#'
    ;
 TASK_TOKEN
-    : '##'
+    : EOL+ '##'
    ;
 SUBTASK_TOKEN
-    : (SUBTASK_NEW_TOKEN | SUBTASK_PARTIAL_TOKEN | SUBTASK_COMPLETE_TOKEN | SUBTASK_OBSOLETE_TOKEN)
+    : EOL+ (SUBTASK_NEW_TOKEN | SUBTASK_PARTIAL_TOKEN | SUBTASK_COMPLETE_TOKEN | SUBTASK_OBSOLETE_TOKEN)
    ;
 SUBTASK_NEW_TOKEN
@@ -78,7 +82,7 @@ SUBTASK_OBSOLETE_TOKEN
    ;
 S
-    : ' ' -> skip
+    : ' '
    ;
@@ -106,22 +110,12 @@ END_AMOUNT
    : '}'
    ;
-// all special characters, including ' ' and digits are subtracted from the printable character range
+// Match printable characters, except space which is covered by S
-// '!' '#' '-' '/' '*'
+CHAR
-fragment STARTCHAR
+    : [!-~\u00A0-\u33FF] // ASCII and UNICODE
    : ["$-)+-,.:-z|~\u00A0-\u33FF]
    ;
-// A text should not end with a space, so the ENDHAR omits the space
+// This is a performance improvement that groups chars that do not have a special meaning
-fragment ENDCHAR
+WORD
-    : ["-~\u00A0-\u33FF]
+    : [A-Za-z\u00A0-\u33FF]+
    ;
 fragment CHAR
    : [ -~\u00A0-\u33FF] // ASCII and UNICODE
    ;
 // A text cannot start with a special character or has to be placed in quotes
 TEXT
    : STARTCHAR (CHAR* ENDCHAR)?
    ;