mirror of
https://github.com/overleaf/overleaf.git
synced 2024-11-21 20:47:08 -05:00
Use Intl.Segmenter when available for identifying words to spell check (#21225)
GitOrigin-RevId: f261ae331e0b1f87f4e88a512b25c8798f0b02c6
This commit is contained in:
parent
04dbb7d2f2
commit
6c4fac68ca
2 changed files with 70 additions and 27 deletions
|
@ -7,10 +7,7 @@ import { spellCheckRequest } from './backend'
|
||||||
import { EditorView, ViewUpdate } from '@codemirror/view'
|
import { EditorView, ViewUpdate } from '@codemirror/view'
|
||||||
import { ChangeSet, Line, Range, RangeValue } from '@codemirror/state'
|
import { ChangeSet, Line, Range, RangeValue } from '@codemirror/state'
|
||||||
import { IgnoredWords } from '../../../dictionary/ignored-words'
|
import { IgnoredWords } from '../../../dictionary/ignored-words'
|
||||||
import {
|
import { getNormalTextSpansFromLine } from '../../utils/tree-query'
|
||||||
getNormalTextSpansFromLine,
|
|
||||||
NormalTextSpan,
|
|
||||||
} from '../../utils/tree-query'
|
|
||||||
import { waitForParser } from '../wait-for-parser'
|
import { waitForParser } from '../wait-for-parser'
|
||||||
import { debugConsole } from '@/utils/debugging'
|
import { debugConsole } from '@/utils/debugging'
|
||||||
import type { HunspellManager } from '../../hunspell/HunspellManager'
|
import type { HunspellManager } from '../../hunspell/HunspellManager'
|
||||||
|
@ -26,6 +23,7 @@ export class SpellChecker {
|
||||||
private waitingForParser = false
|
private waitingForParser = false
|
||||||
private firstCheckPending = false
|
private firstCheckPending = false
|
||||||
private trackedChanges: ChangeSet
|
private trackedChanges: ChangeSet
|
||||||
|
private readonly segmenter?: Intl.Segmenter
|
||||||
|
|
||||||
// eslint-disable-next-line no-useless-constructor
|
// eslint-disable-next-line no-useless-constructor
|
||||||
constructor(
|
constructor(
|
||||||
|
@ -34,6 +32,32 @@ export class SpellChecker {
|
||||||
) {
|
) {
|
||||||
debugConsole.log('SpellChecker', language, hunspellManager)
|
debugConsole.log('SpellChecker', language, hunspellManager)
|
||||||
this.trackedChanges = ChangeSet.empty(0)
|
this.trackedChanges = ChangeSet.empty(0)
|
||||||
|
|
||||||
|
const locale = language.replace(/_/, '-')
|
||||||
|
|
||||||
|
try {
|
||||||
|
if (Intl.Segmenter) {
|
||||||
|
const supportedLocales = Intl.Segmenter.supportedLocalesOf([locale], {
|
||||||
|
localeMatcher: 'lookup',
|
||||||
|
})
|
||||||
|
|
||||||
|
if (supportedLocales.includes(locale)) {
|
||||||
|
this.segmenter = new Intl.Segmenter(locale, {
|
||||||
|
localeMatcher: 'lookup',
|
||||||
|
granularity: 'word',
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (error) {
|
||||||
|
// ignore, not supported for some reason
|
||||||
|
debugConsole.error(error)
|
||||||
|
}
|
||||||
|
|
||||||
|
if (this.segmenter) {
|
||||||
|
debugConsole.log(`Using Intl.Segmenter for ${locale}`)
|
||||||
|
} else {
|
||||||
|
debugConsole.warn(`Not using Intl.Segmenter for ${locale}`)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
destroy() {
|
destroy() {
|
||||||
|
@ -257,7 +281,13 @@ export class SpellChecker {
|
||||||
for (const i of changedLineNumbers) {
|
for (const i of changedLineNumbers) {
|
||||||
const line = view.state.doc.line(i)
|
const line = view.state.doc.line(i)
|
||||||
wordsToCheck.push(
|
wordsToCheck.push(
|
||||||
...getWordsFromLine(view, line, ignoredWords, this.language)
|
...getWordsFromLine(
|
||||||
|
view,
|
||||||
|
line,
|
||||||
|
ignoredWords,
|
||||||
|
this.language,
|
||||||
|
this.segmenter
|
||||||
|
)
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -347,35 +377,48 @@ export const compileEffects = (results: {
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
||||||
export const getWordsFromLine = (
|
export function* getWordsFromLine(
|
||||||
view: EditorView,
|
view: EditorView,
|
||||||
line: Line,
|
line: Line,
|
||||||
ignoredWords: IgnoredWords | null,
|
ignoredWords: IgnoredWords | null,
|
||||||
lang: string
|
lang: string,
|
||||||
): Word[] => {
|
segmenter?: Intl.Segmenter
|
||||||
const normalTextSpans: Array<NormalTextSpan> = getNormalTextSpansFromLine(
|
) {
|
||||||
view,
|
for (const span of getNormalTextSpansFromLine(view, line)) {
|
||||||
line
|
if (segmenter) {
|
||||||
)
|
for (const value of segmenter.segment(span.text)) {
|
||||||
const words: Word[] = []
|
if (value.isWordLike && !ignoredWords?.has(value.segment)) {
|
||||||
for (const span of normalTextSpans) {
|
const word = value.segment
|
||||||
for (const match of span.text.matchAll(WORD_REGEX)) {
|
const from = span.from + value.index
|
||||||
const word = match[0].replace(/^'+/, '').replace(/'+$/, '')
|
yield new Word({
|
||||||
if (!ignoredWords?.has(word)) {
|
|
||||||
const from = span.from + match.index
|
|
||||||
words.push(
|
|
||||||
new Word({
|
|
||||||
text: word,
|
text: word,
|
||||||
from,
|
from,
|
||||||
to: from + word.length,
|
to: from + word.length,
|
||||||
lineNumber: line.number,
|
lineNumber: line.number,
|
||||||
lang,
|
lang,
|
||||||
})
|
})
|
||||||
)
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for (const match of span.text.matchAll(WORD_REGEX)) {
|
||||||
|
let word = match[0].replace(/'+$/, '')
|
||||||
|
let from = span.from + match.index
|
||||||
|
while (word.startsWith("'")) {
|
||||||
|
word = word.slice(1)
|
||||||
|
from++
|
||||||
|
}
|
||||||
|
if (!ignoredWords?.has(word)) {
|
||||||
|
yield new Word({
|
||||||
|
text: word,
|
||||||
|
from,
|
||||||
|
to: from + word.length,
|
||||||
|
lineNumber: line.number,
|
||||||
|
lang,
|
||||||
|
})
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return words
|
|
||||||
}
|
}
|
||||||
|
|
||||||
export type Mark = Range<RangeValue & { spec: { word: Word } }>
|
export type Mark = Range<RangeValue & { spec: { word: Word } }>
|
||||||
|
|
|
@ -26,7 +26,7 @@ describe('SpellChecker', function () {
|
||||||
extensions,
|
extensions,
|
||||||
})
|
})
|
||||||
const line = view.state.doc.line(1)
|
const line = view.state.doc.line(1)
|
||||||
const words = getWordsFromLine(view, line, ignoredWords, lang)
|
const words = Array.from(getWordsFromLine(view, line, ignoredWords, lang))
|
||||||
expect(words).to.deep.equal([
|
expect(words).to.deep.equal([
|
||||||
{ text: 'Hello', from: 0, to: 5, lineNumber: 1, lang: 'en' },
|
{ text: 'Hello', from: 0, to: 5, lineNumber: 1, lang: 'en' },
|
||||||
{ text: 'test', from: 6, to: 10, lineNumber: 1, lang: 'en' },
|
{ text: 'test', from: 6, to: 10, lineNumber: 1, lang: 'en' },
|
||||||
|
@ -42,7 +42,7 @@ describe('SpellChecker', function () {
|
||||||
extensions,
|
extensions,
|
||||||
})
|
})
|
||||||
const line = view.state.doc.line(1)
|
const line = view.state.doc.line(1)
|
||||||
const words = getWordsFromLine(view, line, ignoredWords, lang)
|
const words = Array.from(getWordsFromLine(view, line, ignoredWords, lang))
|
||||||
expect(words).to.deep.equal([
|
expect(words).to.deep.equal([
|
||||||
{ text: 'Hello', from: 0, to: 5, lineNumber: 1, lang: 'en' },
|
{ text: 'Hello', from: 0, to: 5, lineNumber: 1, lang: 'en' },
|
||||||
{ text: 'one', from: 11, to: 14, lineNumber: 1, lang: 'en' },
|
{ text: 'one', from: 11, to: 14, lineNumber: 1, lang: 'en' },
|
||||||
|
@ -56,7 +56,7 @@ describe('SpellChecker', function () {
|
||||||
extensions,
|
extensions,
|
||||||
})
|
})
|
||||||
const line = view.state.doc.line(1)
|
const line = view.state.doc.line(1)
|
||||||
const words = getWordsFromLine(view, line, ignoredWords, lang)
|
const words = Array.from(getWordsFromLine(view, line, ignoredWords, lang))
|
||||||
expect(words).to.deep.equal([])
|
expect(words).to.deep.equal([])
|
||||||
})
|
})
|
||||||
|
|
||||||
|
@ -66,7 +66,7 @@ describe('SpellChecker', function () {
|
||||||
extensions,
|
extensions,
|
||||||
})
|
})
|
||||||
const line = view.state.doc.line(1)
|
const line = view.state.doc.line(1)
|
||||||
const words = getWordsFromLine(view, line, ignoredWords, lang)
|
const words = Array.from(getWordsFromLine(view, line, ignoredWords, lang))
|
||||||
expect(words).to.deep.equal([
|
expect(words).to.deep.equal([
|
||||||
{ text: 'seven', from: 24, to: 29, lineNumber: 1, lang: 'en' },
|
{ text: 'seven', from: 24, to: 29, lineNumber: 1, lang: 'en' },
|
||||||
{ text: 'eight', from: 30, to: 35, lineNumber: 1, lang: 'en' },
|
{ text: 'eight', from: 30, to: 35, lineNumber: 1, lang: 'en' },
|
||||||
|
@ -79,7 +79,7 @@ describe('SpellChecker', function () {
|
||||||
extensions,
|
extensions,
|
||||||
})
|
})
|
||||||
const line = view.state.doc.line(1)
|
const line = view.state.doc.line(1)
|
||||||
const words = getWordsFromLine(view, line, ignoredWords, lang)
|
const words = Array.from(getWordsFromLine(view, line, ignoredWords, lang))
|
||||||
expect(words).to.deep.equal([
|
expect(words).to.deep.equal([
|
||||||
{ text: 'nine', from: 5, to: 9, lineNumber: 1, lang: 'en' },
|
{ text: 'nine', from: 5, to: 9, lineNumber: 1, lang: 'en' },
|
||||||
{ text: 'ten', from: 15, to: 18, lineNumber: 1, lang: 'en' },
|
{ text: 'ten', from: 15, to: 18, lineNumber: 1, lang: 'en' },
|
||||||
|
|
Loading…
Reference in a new issue