mirror of
https://github.com/overleaf/overleaf.git
synced 2024-11-21 20:47:08 -05:00
Use Intl.Segmenter when available for identifying words to spell check (#21225)
GitOrigin-RevId: f261ae331e0b1f87f4e88a512b25c8798f0b02c6
This commit is contained in:
parent
04dbb7d2f2
commit
6c4fac68ca
2 changed files with 70 additions and 27 deletions
|
@ -7,10 +7,7 @@ import { spellCheckRequest } from './backend'
|
|||
import { EditorView, ViewUpdate } from '@codemirror/view'
|
||||
import { ChangeSet, Line, Range, RangeValue } from '@codemirror/state'
|
||||
import { IgnoredWords } from '../../../dictionary/ignored-words'
|
||||
import {
|
||||
getNormalTextSpansFromLine,
|
||||
NormalTextSpan,
|
||||
} from '../../utils/tree-query'
|
||||
import { getNormalTextSpansFromLine } from '../../utils/tree-query'
|
||||
import { waitForParser } from '../wait-for-parser'
|
||||
import { debugConsole } from '@/utils/debugging'
|
||||
import type { HunspellManager } from '../../hunspell/HunspellManager'
|
||||
|
@ -26,6 +23,7 @@ export class SpellChecker {
|
|||
private waitingForParser = false
|
||||
private firstCheckPending = false
|
||||
private trackedChanges: ChangeSet
|
||||
private readonly segmenter?: Intl.Segmenter
|
||||
|
||||
// eslint-disable-next-line no-useless-constructor
|
||||
constructor(
|
||||
|
@ -34,6 +32,32 @@ export class SpellChecker {
|
|||
) {
|
||||
debugConsole.log('SpellChecker', language, hunspellManager)
|
||||
this.trackedChanges = ChangeSet.empty(0)
|
||||
|
||||
const locale = language.replace(/_/, '-')
|
||||
|
||||
try {
|
||||
if (Intl.Segmenter) {
|
||||
const supportedLocales = Intl.Segmenter.supportedLocalesOf([locale], {
|
||||
localeMatcher: 'lookup',
|
||||
})
|
||||
|
||||
if (supportedLocales.includes(locale)) {
|
||||
this.segmenter = new Intl.Segmenter(locale, {
|
||||
localeMatcher: 'lookup',
|
||||
granularity: 'word',
|
||||
})
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
// ignore, not supported for some reason
|
||||
debugConsole.error(error)
|
||||
}
|
||||
|
||||
if (this.segmenter) {
|
||||
debugConsole.log(`Using Intl.Segmenter for ${locale}`)
|
||||
} else {
|
||||
debugConsole.warn(`Not using Intl.Segmenter for ${locale}`)
|
||||
}
|
||||
}
|
||||
|
||||
destroy() {
|
||||
|
@ -257,7 +281,13 @@ export class SpellChecker {
|
|||
for (const i of changedLineNumbers) {
|
||||
const line = view.state.doc.line(i)
|
||||
wordsToCheck.push(
|
||||
...getWordsFromLine(view, line, ignoredWords, this.language)
|
||||
...getWordsFromLine(
|
||||
view,
|
||||
line,
|
||||
ignoredWords,
|
||||
this.language,
|
||||
this.segmenter
|
||||
)
|
||||
)
|
||||
}
|
||||
|
||||
|
@ -347,35 +377,48 @@ export const compileEffects = (results: {
|
|||
]
|
||||
}
|
||||
|
||||
export const getWordsFromLine = (
|
||||
export function* getWordsFromLine(
|
||||
view: EditorView,
|
||||
line: Line,
|
||||
ignoredWords: IgnoredWords | null,
|
||||
lang: string
|
||||
): Word[] => {
|
||||
const normalTextSpans: Array<NormalTextSpan> = getNormalTextSpansFromLine(
|
||||
view,
|
||||
line
|
||||
)
|
||||
const words: Word[] = []
|
||||
for (const span of normalTextSpans) {
|
||||
for (const match of span.text.matchAll(WORD_REGEX)) {
|
||||
const word = match[0].replace(/^'+/, '').replace(/'+$/, '')
|
||||
if (!ignoredWords?.has(word)) {
|
||||
const from = span.from + match.index
|
||||
words.push(
|
||||
new Word({
|
||||
lang: string,
|
||||
segmenter?: Intl.Segmenter
|
||||
) {
|
||||
for (const span of getNormalTextSpansFromLine(view, line)) {
|
||||
if (segmenter) {
|
||||
for (const value of segmenter.segment(span.text)) {
|
||||
if (value.isWordLike && !ignoredWords?.has(value.segment)) {
|
||||
const word = value.segment
|
||||
const from = span.from + value.index
|
||||
yield new Word({
|
||||
text: word,
|
||||
from,
|
||||
to: from + word.length,
|
||||
lineNumber: line.number,
|
||||
lang,
|
||||
})
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (const match of span.text.matchAll(WORD_REGEX)) {
|
||||
let word = match[0].replace(/'+$/, '')
|
||||
let from = span.from + match.index
|
||||
while (word.startsWith("'")) {
|
||||
word = word.slice(1)
|
||||
from++
|
||||
}
|
||||
if (!ignoredWords?.has(word)) {
|
||||
yield new Word({
|
||||
text: word,
|
||||
from,
|
||||
to: from + word.length,
|
||||
lineNumber: line.number,
|
||||
lang,
|
||||
})
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
return words
|
||||
}
|
||||
}
|
||||
|
||||
export type Mark = Range<RangeValue & { spec: { word: Word } }>
|
||||
|
|
|
@ -26,7 +26,7 @@ describe('SpellChecker', function () {
|
|||
extensions,
|
||||
})
|
||||
const line = view.state.doc.line(1)
|
||||
const words = getWordsFromLine(view, line, ignoredWords, lang)
|
||||
const words = Array.from(getWordsFromLine(view, line, ignoredWords, lang))
|
||||
expect(words).to.deep.equal([
|
||||
{ text: 'Hello', from: 0, to: 5, lineNumber: 1, lang: 'en' },
|
||||
{ text: 'test', from: 6, to: 10, lineNumber: 1, lang: 'en' },
|
||||
|
@ -42,7 +42,7 @@ describe('SpellChecker', function () {
|
|||
extensions,
|
||||
})
|
||||
const line = view.state.doc.line(1)
|
||||
const words = getWordsFromLine(view, line, ignoredWords, lang)
|
||||
const words = Array.from(getWordsFromLine(view, line, ignoredWords, lang))
|
||||
expect(words).to.deep.equal([
|
||||
{ text: 'Hello', from: 0, to: 5, lineNumber: 1, lang: 'en' },
|
||||
{ text: 'one', from: 11, to: 14, lineNumber: 1, lang: 'en' },
|
||||
|
@ -56,7 +56,7 @@ describe('SpellChecker', function () {
|
|||
extensions,
|
||||
})
|
||||
const line = view.state.doc.line(1)
|
||||
const words = getWordsFromLine(view, line, ignoredWords, lang)
|
||||
const words = Array.from(getWordsFromLine(view, line, ignoredWords, lang))
|
||||
expect(words).to.deep.equal([])
|
||||
})
|
||||
|
||||
|
@ -66,7 +66,7 @@ describe('SpellChecker', function () {
|
|||
extensions,
|
||||
})
|
||||
const line = view.state.doc.line(1)
|
||||
const words = getWordsFromLine(view, line, ignoredWords, lang)
|
||||
const words = Array.from(getWordsFromLine(view, line, ignoredWords, lang))
|
||||
expect(words).to.deep.equal([
|
||||
{ text: 'seven', from: 24, to: 29, lineNumber: 1, lang: 'en' },
|
||||
{ text: 'eight', from: 30, to: 35, lineNumber: 1, lang: 'en' },
|
||||
|
@ -79,7 +79,7 @@ describe('SpellChecker', function () {
|
|||
extensions,
|
||||
})
|
||||
const line = view.state.doc.line(1)
|
||||
const words = getWordsFromLine(view, line, ignoredWords, lang)
|
||||
const words = Array.from(getWordsFromLine(view, line, ignoredWords, lang))
|
||||
expect(words).to.deep.equal([
|
||||
{ text: 'nine', from: 5, to: 9, lineNumber: 1, lang: 'en' },
|
||||
{ text: 'ten', from: 15, to: 18, lineNumber: 1, lang: 'en' },
|
||||
|
|
Loading…
Reference in a new issue