From 283c9728420470e15646f8a4ded2f5c06b3a9158 Mon Sep 17 00:00:00 2001 From: Mathias Jakobsen Date: Tue, 25 Jun 2024 09:05:41 +0100 Subject: [PATCH] Merge pull request #19032 from overleaf/mj-lezer-mathdelim [lezer] Remove custom tokeniser for MathDelimiter GitOrigin-RevId: acbbbe439d51a8a9d5b30b91c55c8f8ef8c3b9fb --- .../source-editor/lezer-latex/latex.grammar | 23 +++- .../source-editor/lezer-latex/tokens.mjs | 108 ------------------ 2 files changed, 19 insertions(+), 112 deletions(-) diff --git a/services/web/frontend/js/features/source-editor/lezer-latex/latex.grammar b/services/web/frontend/js/features/source-editor/lezer-latex/latex.grammar index 97cb2a7ac2..849c3f4439 100644 --- a/services/web/frontend/js/features/source-editor/lezer-latex/latex.grammar +++ b/services/web/frontend/js/features/source-editor/lezer-latex/latex.grammar @@ -25,10 +25,6 @@ VerbatimContent } -@external tokens mathDelimiterTokenizer from "./tokens.mjs" { - MathDelimiter -} - // external tokenizer to read control sequence names including @ signs // (which are often used in TeX definitions). @external tokens csnameTokenizer from "./tokens.mjs" { @@ -733,6 +729,25 @@ MathClosing { RightCtrlSeq optionalWhitespace? MathDelimiter } +MathDelimiter { + // Allowed delimiters, from the LaTeX manual, table 3.10 + "/" | "|" | "(" | ")" | "[" | "]" | + "\\{" | "\\}" | "\\|" | + "\\lfloor" | "\\rfloor" | + "\\lceil" | "\\rceil" | + "\\langle" | "\\rangle" | + "\\backslash" | "\\uparrow" | + "\\Uparrow" | "\\Downarrow" | + "\\updownarrow" | "\\Updownarrow" | + "\\downarrow" | "\\lvert" | + "\\lVert" | "\\rVert" | + "\\rvert" | "\\vert" | "\\Vert" | + "\\lbrace" | "\\rbrace" | + "\\lbrack" | "\\rbrack" | + // Also allow the empty match + "." +} + // NOTE: precedence works differently for rules and token, in the rule // you have to give a specifier !foo which is defined in the @precedence // block here. diff --git a/services/web/frontend/js/features/source-editor/lezer-latex/tokens.mjs b/services/web/frontend/js/features/source-editor/lezer-latex/tokens.mjs index e43da88cc3..84a86da6f6 100644 --- a/services/web/frontend/js/features/source-editor/lezer-latex/tokens.mjs +++ b/services/web/frontend/js/features/source-editor/lezer-latex/tokens.mjs @@ -11,7 +11,6 @@ import { Begin, End, KnownEnvironment, - MathDelimiter, Csname, TrailingWhitespaceOnly, TrailingContent, @@ -250,17 +249,6 @@ function _char(s) { return s.charCodeAt(0) } -// Allowed delimiters, from the LaTeX manual, table 3.10 -// ( ) [ ] / | \{ \} \| and additional names below -// The empty delimiter . is also allowed - -const CHAR_SLASH = _char('/') -const CHAR_PIPE = _char('|') -const CHAR_OPEN_PAREN = _char('(') -const CHAR_CLOSE_PAREN = _char(')') -const CHAR_OPEN_BRACKET = _char('[') -const CHAR_CLOSE_BRACKET = _char(']') -const CHAR_FULL_STOP = _char('.') const CHAR_BACKSLASH = _char('\\') const CHAR_OPEN_BRACE = _char('{') const CHAR_CLOSE_BRACE = _char('}') @@ -281,102 +269,6 @@ export const endOfArgumentListTokenizer = new ExternalTokenizer( { contextual: false, fallback: true } ) -const ALLOWED_DELIMITER_NAMES = [ - 'lfloor', - 'rfloor', - 'lceil', - 'rceil', - 'langle', - 'rangle', - 'backslash', - 'uparrow', - 'downarrow', - 'Uparrow', - 'Downarrow', - 'updownarrow', - 'Updownarrow', - 'lvert', - 'rvert', - 'lVert', - 'rVert', -] - -// Given a list of allowed command names, return those with leading characters that are the same as the matchString -function findPartialMatches(list, matchString) { - const size = matchString.length - return list.filter( - entry => entry.length >= size && entry.substring(0, size) === matchString - ) -} - -// tokenizer for \leftX ... \rightX delimiter tokens -export const mathDelimiterTokenizer = new ExternalTokenizer( - (input, stack) => { - let content = '' - let offset = 0 - let end = -1 - // look at the first character, we only accept the following /|()[]. - let next = input.peek(offset) - if (next === -1) { - return - } - if ( - next === CHAR_SLASH || - next === CHAR_PIPE || - next === CHAR_OPEN_PAREN || - next === CHAR_CLOSE_PAREN || - next === CHAR_OPEN_BRACKET || - next === CHAR_CLOSE_BRACKET || - next === CHAR_FULL_STOP - ) { - return input.acceptToken(MathDelimiter, 1) - } - // reject anything else not starting with a backslash, - // we only accept control symbols or control sequences - if (next !== CHAR_BACKSLASH) { - return - } - // look at the second character, we only accept \{ and \} and \| as control symbols - offset++ - next = input.peek(offset) - if (next === -1) { - return - } - if ( - next === CHAR_OPEN_BRACE || - next === CHAR_CLOSE_BRACE || - next === CHAR_PIPE - ) { - return input.acceptToken(MathDelimiter, 2) - } - // We haven't matched any symbols, so now try matching command names. - // Is this character a potential match to the remaining allowed delimiter names? - content = String.fromCharCode(next) - let candidates = findPartialMatches(ALLOWED_DELIMITER_NAMES, content) - if (!candidates.length) return - // we have some candidates, look at subsequent characters - offset++ - for (;;) { - const next = input.peek(offset) - // stop when we reach the end of file or a non-alphabetic character - if (next === -1 || !nameChar(next)) { - end = offset - 1 - break - } - content += String.fromCharCode(next) - // find how many candidates remain with the new input - candidates = findPartialMatches(candidates, content) - if (!candidates.length) return // no matches remaining - end = offset - offset++ - } - if (!candidates.includes(content)) return // not a valid delimiter - // accept the content as a valid delimiter - return input.acceptToken(MathDelimiter, end + 1) - }, - { contextual: false } -) - const CHAR_AT_SYMBOL = _char('@') export const csnameTokenizer = new ExternalTokenizer((input, stack) => {