Use Intl.Segmenter when available for identifying words to spell check (#21225)

GitOrigin-RevId: f261ae331e0b1f87f4e88a512b25c8798f0b02c6
This commit is contained in:
Alf Eaton 2024-10-21 10:58:49 +01:00 committed by Copybot
parent 04dbb7d2f2
commit 6c4fac68ca
2 changed files with 70 additions and 27 deletions

View file

@ -7,10 +7,7 @@ import { spellCheckRequest } from './backend'
import { EditorView, ViewUpdate } from '@codemirror/view' import { EditorView, ViewUpdate } from '@codemirror/view'
import { ChangeSet, Line, Range, RangeValue } from '@codemirror/state' import { ChangeSet, Line, Range, RangeValue } from '@codemirror/state'
import { IgnoredWords } from '../../../dictionary/ignored-words' import { IgnoredWords } from '../../../dictionary/ignored-words'
import { import { getNormalTextSpansFromLine } from '../../utils/tree-query'
getNormalTextSpansFromLine,
NormalTextSpan,
} from '../../utils/tree-query'
import { waitForParser } from '../wait-for-parser' import { waitForParser } from '../wait-for-parser'
import { debugConsole } from '@/utils/debugging' import { debugConsole } from '@/utils/debugging'
import type { HunspellManager } from '../../hunspell/HunspellManager' import type { HunspellManager } from '../../hunspell/HunspellManager'
@ -26,6 +23,7 @@ export class SpellChecker {
private waitingForParser = false private waitingForParser = false
private firstCheckPending = false private firstCheckPending = false
private trackedChanges: ChangeSet private trackedChanges: ChangeSet
private readonly segmenter?: Intl.Segmenter
// eslint-disable-next-line no-useless-constructor // eslint-disable-next-line no-useless-constructor
constructor( constructor(
@ -34,6 +32,32 @@ export class SpellChecker {
) { ) {
debugConsole.log('SpellChecker', language, hunspellManager) debugConsole.log('SpellChecker', language, hunspellManager)
this.trackedChanges = ChangeSet.empty(0) this.trackedChanges = ChangeSet.empty(0)
const locale = language.replace(/_/, '-')
try {
if (Intl.Segmenter) {
const supportedLocales = Intl.Segmenter.supportedLocalesOf([locale], {
localeMatcher: 'lookup',
})
if (supportedLocales.includes(locale)) {
this.segmenter = new Intl.Segmenter(locale, {
localeMatcher: 'lookup',
granularity: 'word',
})
}
}
} catch (error) {
// ignore, not supported for some reason
debugConsole.error(error)
}
if (this.segmenter) {
debugConsole.log(`Using Intl.Segmenter for ${locale}`)
} else {
debugConsole.warn(`Not using Intl.Segmenter for ${locale}`)
}
} }
destroy() { destroy() {
@ -257,7 +281,13 @@ export class SpellChecker {
for (const i of changedLineNumbers) { for (const i of changedLineNumbers) {
const line = view.state.doc.line(i) const line = view.state.doc.line(i)
wordsToCheck.push( wordsToCheck.push(
...getWordsFromLine(view, line, ignoredWords, this.language) ...getWordsFromLine(
view,
line,
ignoredWords,
this.language,
this.segmenter
)
) )
} }
@ -347,35 +377,48 @@ export const compileEffects = (results: {
] ]
} }
export const getWordsFromLine = ( export function* getWordsFromLine(
view: EditorView, view: EditorView,
line: Line, line: Line,
ignoredWords: IgnoredWords | null, ignoredWords: IgnoredWords | null,
lang: string lang: string,
): Word[] => { segmenter?: Intl.Segmenter
const normalTextSpans: Array<NormalTextSpan> = getNormalTextSpansFromLine( ) {
view, for (const span of getNormalTextSpansFromLine(view, line)) {
line if (segmenter) {
) for (const value of segmenter.segment(span.text)) {
const words: Word[] = [] if (value.isWordLike && !ignoredWords?.has(value.segment)) {
for (const span of normalTextSpans) { const word = value.segment
for (const match of span.text.matchAll(WORD_REGEX)) { const from = span.from + value.index
const word = match[0].replace(/^'+/, '').replace(/'+$/, '') yield new Word({
if (!ignoredWords?.has(word)) {
const from = span.from + match.index
words.push(
new Word({
text: word, text: word,
from, from,
to: from + word.length, to: from + word.length,
lineNumber: line.number, lineNumber: line.number,
lang, lang,
}) })
) }
}
} else {
for (const match of span.text.matchAll(WORD_REGEX)) {
let word = match[0].replace(/'+$/, '')
let from = span.from + match.index
while (word.startsWith("'")) {
word = word.slice(1)
from++
}
if (!ignoredWords?.has(word)) {
yield new Word({
text: word,
from,
to: from + word.length,
lineNumber: line.number,
lang,
})
}
} }
} }
} }
return words
} }
export type Mark = Range<RangeValue & { spec: { word: Word } }> export type Mark = Range<RangeValue & { spec: { word: Word } }>

View file

@ -26,7 +26,7 @@ describe('SpellChecker', function () {
extensions, extensions,
}) })
const line = view.state.doc.line(1) const line = view.state.doc.line(1)
const words = getWordsFromLine(view, line, ignoredWords, lang) const words = Array.from(getWordsFromLine(view, line, ignoredWords, lang))
expect(words).to.deep.equal([ expect(words).to.deep.equal([
{ text: 'Hello', from: 0, to: 5, lineNumber: 1, lang: 'en' }, { text: 'Hello', from: 0, to: 5, lineNumber: 1, lang: 'en' },
{ text: 'test', from: 6, to: 10, lineNumber: 1, lang: 'en' }, { text: 'test', from: 6, to: 10, lineNumber: 1, lang: 'en' },
@ -42,7 +42,7 @@ describe('SpellChecker', function () {
extensions, extensions,
}) })
const line = view.state.doc.line(1) const line = view.state.doc.line(1)
const words = getWordsFromLine(view, line, ignoredWords, lang) const words = Array.from(getWordsFromLine(view, line, ignoredWords, lang))
expect(words).to.deep.equal([ expect(words).to.deep.equal([
{ text: 'Hello', from: 0, to: 5, lineNumber: 1, lang: 'en' }, { text: 'Hello', from: 0, to: 5, lineNumber: 1, lang: 'en' },
{ text: 'one', from: 11, to: 14, lineNumber: 1, lang: 'en' }, { text: 'one', from: 11, to: 14, lineNumber: 1, lang: 'en' },
@ -56,7 +56,7 @@ describe('SpellChecker', function () {
extensions, extensions,
}) })
const line = view.state.doc.line(1) const line = view.state.doc.line(1)
const words = getWordsFromLine(view, line, ignoredWords, lang) const words = Array.from(getWordsFromLine(view, line, ignoredWords, lang))
expect(words).to.deep.equal([]) expect(words).to.deep.equal([])
}) })
@ -66,7 +66,7 @@ describe('SpellChecker', function () {
extensions, extensions,
}) })
const line = view.state.doc.line(1) const line = view.state.doc.line(1)
const words = getWordsFromLine(view, line, ignoredWords, lang) const words = Array.from(getWordsFromLine(view, line, ignoredWords, lang))
expect(words).to.deep.equal([ expect(words).to.deep.equal([
{ text: 'seven', from: 24, to: 29, lineNumber: 1, lang: 'en' }, { text: 'seven', from: 24, to: 29, lineNumber: 1, lang: 'en' },
{ text: 'eight', from: 30, to: 35, lineNumber: 1, lang: 'en' }, { text: 'eight', from: 30, to: 35, lineNumber: 1, lang: 'en' },
@ -79,7 +79,7 @@ describe('SpellChecker', function () {
extensions, extensions,
}) })
const line = view.state.doc.line(1) const line = view.state.doc.line(1)
const words = getWordsFromLine(view, line, ignoredWords, lang) const words = Array.from(getWordsFromLine(view, line, ignoredWords, lang))
expect(words).to.deep.equal([ expect(words).to.deep.equal([
{ text: 'nine', from: 5, to: 9, lineNumber: 1, lang: 'en' }, { text: 'nine', from: 5, to: 9, lineNumber: 1, lang: 'en' },
{ text: 'ten', from: 15, to: 18, lineNumber: 1, lang: 'en' }, { text: 'ten', from: 15, to: 18, lineNumber: 1, lang: 'en' },