Merge pull request #19032 from overleaf/mj-lezer-mathdelim

[lezer] Remove custom tokeniser for MathDelimiter

GitOrigin-RevId: acbbbe439d51a8a9d5b30b91c55c8f8ef8c3b9fb
This commit is contained in:
Mathias Jakobsen 2024-06-25 09:05:41 +01:00 committed by Copybot
parent 884eebd82d
commit 283c972842
2 changed files with 19 additions and 112 deletions

View file

@ -25,10 +25,6 @@
VerbatimContent
}
@external tokens mathDelimiterTokenizer from "./tokens.mjs" {
MathDelimiter
}
// external tokenizer to read control sequence names including @ signs
// (which are often used in TeX definitions).
@external tokens csnameTokenizer from "./tokens.mjs" {
@ -733,6 +729,25 @@ MathClosing {
RightCtrlSeq optionalWhitespace? MathDelimiter
}
// Grammar rule for the delimiter that follows a \left / \right control
// sequence (see MathClosing above) — replaces the former external
// mathDelimiterTokenizer in tokens.mjs.
MathDelimiter {
// Allowed delimiters, from the LaTeX manual, table 3.10
"/" | "|" | "(" | ")" | "[" | "]" |
// control symbols: backslash followed by a single punctuation character
"\\{" | "\\}" | "\\|" |
// named control sequences
"\\lfloor" | "\\rfloor" |
"\\lceil" | "\\rceil" |
"\\langle" | "\\rangle" |
"\\backslash" | "\\uparrow" |
"\\Uparrow" | "\\Downarrow" |
"\\updownarrow" | "\\Updownarrow" |
"\\downarrow" | "\\lvert" |
"\\lVert" | "\\rVert" |
"\\rvert" | "\\vert" | "\\Vert" |
"\\lbrace" | "\\rbrace" |
"\\lbrack" | "\\rbrack" |
// Also allow the empty match
// ("." is LaTeX's null delimiter, e.g. \left. x \right|)
"."
}
// NOTE: precedence works differently for rules and token, in the rule
// you have to give a specifier !foo which is defined in the @precedence
// block here.

View file

@ -11,7 +11,6 @@ import {
Begin,
End,
KnownEnvironment,
MathDelimiter,
Csname,
TrailingWhitespaceOnly,
TrailingContent,
@ -250,17 +249,6 @@ function _char(s) {
return s.charCodeAt(0)
}
// Allowed delimiters, from the LaTeX manual, table 3.10
// ( ) [ ] / | \{ \} \| and additional names below
// The empty delimiter . is also allowed
// Character codes compared against input.peek() in the tokenizers below.
const CHAR_SLASH = _char('/')
const CHAR_PIPE = _char('|')
const CHAR_OPEN_PAREN = _char('(')
const CHAR_CLOSE_PAREN = _char(')')
const CHAR_OPEN_BRACKET = _char('[')
const CHAR_CLOSE_BRACKET = _char(']')
const CHAR_FULL_STOP = _char('.')
const CHAR_BACKSLASH = _char('\\')
const CHAR_OPEN_BRACE = _char('{')
const CHAR_CLOSE_BRACE = _char('}')
@ -281,102 +269,6 @@ export const endOfArgumentListTokenizer = new ExternalTokenizer(
{ contextual: false, fallback: true }
)
// Control-sequence names (without the leading backslash) that are valid
// \left / \right delimiters, from the LaTeX manual, table 3.10.
// Grouped as opening/closing pairs where one exists.
const ALLOWED_DELIMITER_NAMES = [
  'lfloor', 'rfloor',
  'lceil', 'rceil',
  'langle', 'rangle',
  'backslash',
  'uparrow', 'downarrow',
  'Uparrow', 'Downarrow',
  'updownarrow', 'Updownarrow',
  'lvert', 'rvert',
  'lVert', 'rVert',
]
// Given a list of allowed command names, return those with leading characters
// that are the same as the matchString.
//
// @param {string[]} list - candidate command names
// @param {string} matchString - the prefix read from the input so far
// @returns {string[]} entries of `list` that start with `matchString`
function findPartialMatches(list, matchString) {
  // String.prototype.startsWith subsumes the manual
  // `length >= size && substring(0, size) === matchString` comparison,
  // including the case where the prefix is longer than the entry.
  return list.filter(entry => entry.startsWith(matchString))
}
// tokenizer for \leftX ... \rightX delimiter tokens
// External Lezer tokenizer emitting a MathDelimiter token for the
// delimiters allowed after \left and \right (LaTeX manual, table 3.10):
// the single characters / | ( ) [ ] and the empty delimiter ".",
// the control symbols \{ \} \|, and the control sequences whose names
// appear in ALLOWED_DELIMITER_NAMES (e.g. \lfloor, \rangle).
// Returns without accepting a token when the input is not a delimiter.
export const mathDelimiterTokenizer = new ExternalTokenizer(
(input, stack) => {
// control-sequence name accumulated so far (without the backslash)
let content = ''
// current read position relative to the tokenizer's start
let offset = 0
// offset of the last character confirmed to belong to the token (-1 = none)
let end = -1
// look at the first character, we only accept the following /|()[].
let next = input.peek(offset)
if (next === -1) {
// end of input: nothing to tokenize
return
}
if (
next === CHAR_SLASH ||
next === CHAR_PIPE ||
next === CHAR_OPEN_PAREN ||
next === CHAR_CLOSE_PAREN ||
next === CHAR_OPEN_BRACKET ||
next === CHAR_CLOSE_BRACKET ||
next === CHAR_FULL_STOP
) {
// single-character delimiter (includes the empty delimiter ".")
return input.acceptToken(MathDelimiter, 1)
}
// reject anything else not starting with a backslash,
// we only accept control symbols or control sequences
if (next !== CHAR_BACKSLASH) {
return
}
// look at the second character, we only accept \{ and \} and \| as control symbols
offset++
next = input.peek(offset)
if (next === -1) {
return
}
if (
next === CHAR_OPEN_BRACE ||
next === CHAR_CLOSE_BRACE ||
next === CHAR_PIPE
) {
// two-character control symbol: backslash plus one punctuation character
return input.acceptToken(MathDelimiter, 2)
}
// We haven't matched any symbols, so now try matching command names.
// Is this character a potential match to the remaining allowed delimiter names?
content = String.fromCharCode(next)
let candidates = findPartialMatches(ALLOWED_DELIMITER_NAMES, content)
if (!candidates.length) return
// we have some candidates, look at subsequent characters
offset++
for (;;) {
const next = input.peek(offset)
// stop when we reach the end of file or a non-alphabetic character
if (next === -1 || !nameChar(next)) {
end = offset - 1
break
}
content += String.fromCharCode(next)
// find how many candidates remain with the new input
candidates = findPartialMatches(candidates, content)
if (!candidates.length) return // no matches remaining
end = offset
offset++
}
// a strict prefix of a name (e.g. "lflo") is not enough; require an exact match
if (!candidates.includes(content)) return // not a valid delimiter
// accept the content as a valid delimiter
// (token length is end + 1 because `end` is a zero-based offset)
return input.acceptToken(MathDelimiter, end + 1)
},
{ contextual: false }
)
// "@" — used by csnameTokenizer below, since control sequence names may
// include @ signs (common in TeX definitions)
const CHAR_AT_SYMBOL = _char('@')
export const csnameTokenizer = new ExternalTokenizer((input, stack) => {