Split description text into character tokens so that text has a lower ranking

2025-10-27 01:49:22 +00:00 · 2024-09-02 14:00:51 +02:00 · 2024-09-02 14:00:51 +02:00 · f9e92b8152
commit f9e92b8152
parent c0b60b1c4f
1 changed file with 21 additions and 27 deletions
--- a/takentaal.g4
+++ b/takentaal.g4
@@ -8,8 +8,8 @@
 *  - any implicit rule, e.g. 'a', is chosen
 *  - the first defined rule is chosen.
 * Since this grammar has to match unquoted texts and text are usually longer
- * than other token matches, the TEXT rule disallows many characters as the
- * first character to start with.
+ * than other token matches, texts are split into characters so that they have
+ * a lower ranking.
 */

 grammar takentaal;
@@ -20,27 +20,31 @@ takentaal
    ;

 header
-    : 'takentaal v0.1.0' EOL
+    : 'takentaal v0.1.0'
+    ;
+
+text
+    : (S | CHAR | WORD)+
    ;

 plan
-    : PLAN_TOKEN S* amount TEXT EOL
+    : PLAN_TOKEN S* amount text
      description
      task+
    ;

 description
-    : (TEXT EOL)*
+    : (EOL text)*
    ;

 task
-    : TASK_TOKEN S* amount TEXT EOL
+    : TASK_TOKEN S* amount text
      description
      subtask*
    ;

 subtask
-    : SUBTASK_TOKEN S* amount TEXT EOL
+    : SUBTASK_TOKEN S* amount text
      description
    ;

@@ -50,15 +54,15 @@ amount
    ;

 PLAN_TOKEN
-    : '#'
+    : EOL+ '#'
    ;

 TASK_TOKEN
-    : '##'
+    : EOL+ '##'
    ;

 SUBTASK_TOKEN
-    : (SUBTASK_NEW_TOKEN | SUBTASK_PARTIAL_TOKEN | SUBTASK_COMPLETE_TOKEN | SUBTASK_OBSOLETE_TOKEN)
+    : EOL+ (SUBTASK_NEW_TOKEN | SUBTASK_PARTIAL_TOKEN | SUBTASK_COMPLETE_TOKEN | SUBTASK_OBSOLETE_TOKEN)
    ;

 SUBTASK_NEW_TOKEN
@@ -78,7 +82,7 @@ SUBTASK_OBSOLETE_TOKEN
    ;

 S
-    : ' ' -> skip
+    : ' '
    ;
    

@@ -106,22 +110,12 @@ END_AMOUNT
    : '}'
    ;

-// all special characters, including ' ' and digits are subtracted from the printable character range
-// '!' '#' '-' '/' '*'
-fragment STARTCHAR
-    : ["$-)+-,.:-z|~\u00A0-\u33FF]
+// Match printable characters, except space which is covered by S
+CHAR
+    : [!-~\u00A0-\u33FF] // ASCII and UNICODE
    ;

-// A text should not end with a space, so the ENDHAR omits the space
-fragment ENDCHAR
-    : ["-~\u00A0-\u33FF]
-    ;
-
-fragment CHAR
-    : [ -~\u00A0-\u33FF] // ASCII and UNICODE
-    ;
-
-// A text cannot start with a special character or has to be placed in quotes
-TEXT
-    : STARTCHAR (CHAR* ENDCHAR)?
+// This is a performance improvement that groups chars that do not have a special meaning
+WORD
+    : [A-Za-z\u00A0-\u33FF]+
    ;