Small text cleanups

This commit is contained in:
Jos van den Oever 2024-09-04 17:26:57 +02:00
parent 3bf56e0345
commit 941aef502b

View file

@ -1,13 +1,14 @@
/** /**
* This file defines the grammar for takentaal. * This file defines the grammar for takentaal.
* It is divided into parser rules (lowercase) and lexer rules (uppercase). * It is divided into parser rules (lowercase names) and lexer rules (uppercase
* names).
* The parser splits an input into tokens according to the lexer rules. * The parser splits an input into tokens according to the lexer rules.
* At any point, all lexer rules are considered. If multiple rules match, * At any point in the input text, all lexer rules are considered. If multiple
* a lexer rule is chosen as follows: * rules match, a lexer rule is chosen as follows:
* - the rule that matches the longest input is chosen * - the rule that matches the longest input is chosen
* - any implicit rule, e.g. 'a', is chosen * - any implicit rule, e.g. 'a', is chosen
* - the first defined rule is chosen. * - the first defined rule is chosen.
* Since this grammar has to match unquoted texts and text are usually longer * Since this grammar has to match unquoted texts and texts are usually longer
* than other token matches, texts are split into characters so that they have * than other token matches, texts are split into characters so that they have
* a lower ranking. * a lower ranking.
*/ */
@ -69,8 +70,8 @@ a1_0_subtask_token
| SUBTASK_OBSOLETE_TOKEN | SUBTASK_OBSOLETE_TOKEN
; ;
// Any implicit and explicit lexer token that may appear in a text should be listed // Any implicit and explicit lexer token that may appear in a text should be
// in this definition. // listed in this definition.
text text
: (INT | '{' | '}' | S | CHAR | WORD)+ : (INT | '{' | '}' | S | CHAR | WORD)+
; ;
@ -107,5 +108,6 @@ fragment DIGIT : [0-9] ;
// Match printable characters, except space which is covered by S // Match printable characters, except space which is covered by S
CHAR : [!-~\u00A0-\u33FF] ; // ASCII and UNICODE CHAR : [!-~\u00A0-\u33FF] ; // ASCII and UNICODE
// This is a performance improvement that groups chars that do not have a special meaning // This is a performance improvement that groups chars that do not have a
// special meaning
WORD : [A-Za-z\u00A0-\u33FF]+ ; WORD : [A-Za-z\u00A0-\u33FF]+ ;