Merge pull request #19032 from overleaf/mj-lezer-mathdelim

[lezer] Remove custom tokeniser for MathDelimiter

GitOrigin-RevId: acbbbe439d51a8a9d5b30b91c55c8f8ef8c3b9fb
Authored by Mathias Jakobsen on 2024-06-25 09:05:41 +01:00; committed by Copybot
parent 884eebd82d
commit 283c972842
2 changed files with 19 additions and 112 deletions
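
For context, a minimal LaTeX sketch (illustrative only, not part of this commit) of the \left ... \right pairs whose delimiters the MathDelimiter token has to accept; every delimiter used here, including the empty match ".", appears in the grammar rule added below:

% \left and \right each take one delimiter from table 3.10 of the LaTeX manual;
% "." is the empty delimiter, used when only one side should be drawn.
% (\lVert and \rVert assume amsmath is loaded.)
\[
  \left( \frac{a}{b} \right)
  \qquad \left\{ x \right.
  \qquad \left\lfloor y \right\rceil
  \qquad \left\lVert v \right\rVert
\]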


@@ -25,10 +25,6 @@
VerbatimContent
}
@external tokens mathDelimiterTokenizer from "./tokens.mjs" {
MathDelimiter
}
// external tokenizer to read control sequence names including @ signs
// (which are often used in TeX definitions).
@external tokens csnameTokenizer from "./tokens.mjs" {
@@ -733,6 +729,25 @@ MathClosing {
RightCtrlSeq optionalWhitespace? MathDelimiter
}
MathDelimiter {
// Allowed delimiters, from the LaTeX manual, table 3.10
"/" | "|" | "(" | ")" | "[" | "]" |
"\\{" | "\\}" | "\\|" |
"\\lfloor" | "\\rfloor" |
"\\lceil" | "\\rceil" |
"\\langle" | "\\rangle" |
"\\backslash" | "\\uparrow" |
"\\Uparrow" | "\\Downarrow" |
"\\updownarrow" | "\\Updownarrow" |
"\\downarrow" | "\\lvert" |
"\\lVert" | "\\rVert" |
"\\rvert" | "\\vert" | "\\Vert" |
"\\lbrace" | "\\rbrace" |
"\\lbrack" | "\\rbrack" |
// Also allow the empty match
"."
}
// NOTE: precedence works differently for rules and token, in the rule
// you have to give a specifier !foo which is defined in the @precedence
// block here.


@@ -11,7 +11,6 @@ import {
Begin,
End,
KnownEnvironment,
MathDelimiter,
Csname,
TrailingWhitespaceOnly,
TrailingContent,
@@ -250,17 +249,6 @@ function _char(s) {
return s.charCodeAt(0)
}
// Allowed delimiters, from the LaTeX manual, table 3.10
// ( ) [ ] / | \{ \} \| and additional names below
// The empty delimiter . is also allowed
const CHAR_SLASH = _char('/')
const CHAR_PIPE = _char('|')
const CHAR_OPEN_PAREN = _char('(')
const CHAR_CLOSE_PAREN = _char(')')
const CHAR_OPEN_BRACKET = _char('[')
const CHAR_CLOSE_BRACKET = _char(']')
const CHAR_FULL_STOP = _char('.')
const CHAR_BACKSLASH = _char('\\')
const CHAR_OPEN_BRACE = _char('{')
const CHAR_CLOSE_BRACE = _char('}')
@@ -281,102 +269,6 @@ export const endOfArgumentListTokenizer = new ExternalTokenizer(
{ contextual: false, fallback: true }
)
const ALLOWED_DELIMITER_NAMES = [
'lfloor',
'rfloor',
'lceil',
'rceil',
'langle',
'rangle',
'backslash',
'uparrow',
'downarrow',
'Uparrow',
'Downarrow',
'updownarrow',
'Updownarrow',
'lvert',
'rvert',
'lVert',
'rVert',
]
// Given a list of allowed command names, return those with leading characters that are the same as the matchString
function findPartialMatches(list, matchString) {
const size = matchString.length
return list.filter(
entry => entry.length >= size && entry.substring(0, size) === matchString
)
}
// tokenizer for \leftX ... \rightX delimiter tokens
export const mathDelimiterTokenizer = new ExternalTokenizer(
(input, stack) => {
let content = ''
let offset = 0
let end = -1
// look at the first character, we only accept the following /|()[].
let next = input.peek(offset)
if (next === -1) {
return
}
if (
next === CHAR_SLASH ||
next === CHAR_PIPE ||
next === CHAR_OPEN_PAREN ||
next === CHAR_CLOSE_PAREN ||
next === CHAR_OPEN_BRACKET ||
next === CHAR_CLOSE_BRACKET ||
next === CHAR_FULL_STOP
) {
return input.acceptToken(MathDelimiter, 1)
}
// reject anything else not starting with a backslash,
// we only accept control symbols or control sequences
if (next !== CHAR_BACKSLASH) {
return
}
// look at the second character, we only accept \{ and \} and \| as control symbols
offset++
next = input.peek(offset)
if (next === -1) {
return
}
if (
next === CHAR_OPEN_BRACE ||
next === CHAR_CLOSE_BRACE ||
next === CHAR_PIPE
) {
return input.acceptToken(MathDelimiter, 2)
}
// We haven't matched any symbols, so now try matching command names.
// Is this character a potential match to the remaining allowed delimiter names?
content = String.fromCharCode(next)
let candidates = findPartialMatches(ALLOWED_DELIMITER_NAMES, content)
if (!candidates.length) return
// we have some candidates, look at subsequent characters
offset++
for (;;) {
const next = input.peek(offset)
// stop when we reach the end of file or a non-alphabetic character
if (next === -1 || !nameChar(next)) {
end = offset - 1
break
}
content += String.fromCharCode(next)
// find how many candidates remain with the new input
candidates = findPartialMatches(candidates, content)
if (!candidates.length) return // no matches remaining
end = offset
offset++
}
if (!candidates.includes(content)) return // not a valid delimiter
// accept the content as a valid delimiter
return input.acceptToken(MathDelimiter, end + 1)
},
{ contextual: false }
)
const CHAR_AT_SYMBOL = _char('@')
export const csnameTokenizer = new ExternalTokenizer((input, stack) => {