Merge pull request #19032 from overleaf/mj-lezer-mathdelim

[lezer] Remove custom tokeniser for MathDelimiter

GitOrigin-RevId: acbbbe439d51a8a9d5b30b91c55c8f8ef8c3b9fb
Authored by Mathias Jakobsen on 2024-06-25 09:05:41 +01:00; committed by Copybot
parent 884eebd82d
commit 283c972842
2 changed files with 19 additions and 112 deletions
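
For context, a minimal LaTeX sketch (illustrative only, not part of this commit) of the \left ... \right pairs whose delimiters the MathDelimiter token has to accept; every delimiter used here, including the empty match ".", appears in the grammar rule added below:

% \left and \right each take one delimiter from table 3.10 of the LaTeX manual;
% "." is the empty delimiter, used when only one side should be drawn.
% (\lVert and \rVert assume amsmath is loaded.)
\[
  \left( \frac{a}{b} \right)
  \qquad \left\{ x \right.
  \qquad \left\lfloor y \right\rceil
  \qquad \left\lVert v \right\rVert
\]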


@@ -25,10 +25,6 @@
VerbatimContent
}
@external tokens mathDelimiterTokenizer from "./tokens.mjs" {
MathDelimiter
}
// external tokenizer to read control sequence names including @ signs
// (which are often used in TeX definitions).
@external tokens csnameTokenizer from "./tokens.mjs" {
@@ -733,6 +729,25 @@ MathClosing {
RightCtrlSeq optionalWhitespace? MathDelimiter
}
MathDelimiter {
// Allowed delimiters, from the LaTeX manual, table 3.10
"/" | "|" | "(" | ")" | "[" | "]" |
"\\{" | "\\}" | "\\|" |
"\\lfloor" | "\\rfloor" |
"\\lceil" | "\\rceil" |
"\\langle" | "\\rangle" |
"\\backslash" | "\\uparrow" |
"\\Uparrow" | "\\Downarrow" |
"\\updownarrow" | "\\Updownarrow" |
"\\downarrow" | "\\lvert" |
"\\lVert" | "\\rVert" |
"\\rvert" | "\\vert" | "\\Vert" |
"\\lbrace" | "\\rbrace" |
"\\lbrack" | "\\rbrack" |
// Also allow the empty match
"."
}
// NOTE: precedence works differently for rules and token, in the rule
// you have to give a specifier !foo which is defined in the @precedence
// block here.


@@ -11,7 +11,6 @@ import {
Begin,
End,
KnownEnvironment,
MathDelimiter,
Csname,
TrailingWhitespaceOnly,
TrailingContent,
@@ -250,17 +249,6 @@ function _char(s) {
return s.charCodeAt(0)
}
// Allowed delimiters, from the LaTeX manual, table 3.10
// ( ) [ ] / | \{ \} \| and additional names below
// The empty delimiter . is also allowed
const CHAR_SLASH = _char('/')
const CHAR_PIPE = _char('|')
const CHAR_OPEN_PAREN = _char('(')
const CHAR_CLOSE_PAREN = _char(')')
const CHAR_OPEN_BRACKET = _char('[')
const CHAR_CLOSE_BRACKET = _char(']')
const CHAR_FULL_STOP = _char('.')
const CHAR_BACKSLASH = _char('\\')
const CHAR_OPEN_BRACE = _char('{')
const CHAR_CLOSE_BRACE = _char('}')
@@ -281,102 +269,6 @@ export const endOfArgumentListTokenizer = new ExternalTokenizer(
{ contextual: false, fallback: true }
)
const ALLOWED_DELIMITER_NAMES = [
'lfloor',
'rfloor',
'lceil',
'rceil',
'langle',
'rangle',
'backslash',
'uparrow',
'downarrow',
'Uparrow',
'Downarrow',
'updownarrow',
'Updownarrow',
'lvert',
'rvert',
'lVert',
'rVert',
]
// Given a list of allowed command names, return those with leading characters that are the same as the matchString
function findPartialMatches(list, matchString) {
const size = matchString.length
return list.filter(
entry => entry.length >= size && entry.substring(0, size) === matchString
)
}
// tokenizer for \leftX ... \rightX delimiter tokens
export const mathDelimiterTokenizer = new ExternalTokenizer(
(input, stack) => {
let content = ''
let offset = 0
let end = -1
// look at the first character, we only accept the following /|()[].
let next = input.peek(offset)
if (next === -1) {
return
}
if (
next === CHAR_SLASH ||
next === CHAR_PIPE ||
next === CHAR_OPEN_PAREN ||
next === CHAR_CLOSE_PAREN ||
next === CHAR_OPEN_BRACKET ||
next === CHAR_CLOSE_BRACKET ||
next === CHAR_FULL_STOP
) {
return input.acceptToken(MathDelimiter, 1)
}
// reject anything else not starting with a backslash,
// we only accept control symbols or control sequences
if (next !== CHAR_BACKSLASH) {
return
}
// look at the second character, we only accept \{ and \} and \| as control symbols
offset++
next = input.peek(offset)
if (next === -1) {
return
}
if (
next === CHAR_OPEN_BRACE ||
next === CHAR_CLOSE_BRACE ||
next === CHAR_PIPE
) {
return input.acceptToken(MathDelimiter, 2)
}
// We haven't matched any symbols, so now try matching command names.
// Is this character a potential match to the remaining allowed delimiter names?
content = String.fromCharCode(next)
let candidates = findPartialMatches(ALLOWED_DELIMITER_NAMES, content)
if (!candidates.length) return
// we have some candidates, look at subsequent characters
offset++
for (;;) {
const next = input.peek(offset)
// stop when we reach the end of file or a non-alphabetic character
if (next === -1 || !nameChar(next)) {
end = offset - 1
break
}
content += String.fromCharCode(next)
// find how many candidates remain with the new input
candidates = findPartialMatches(candidates, content)
if (!candidates.length) return // no matches remaining
end = offset
offset++
}
if (!candidates.includes(content)) return // not a valid delimiter
// accept the content as a valid delimiter
return input.acceptToken(MathDelimiter, end + 1)
},
{ contextual: false }
)
const CHAR_AT_SYMBOL = _char('@')
export const csnameTokenizer = new ExternalTokenizer((input, stack) => {