Use Intl.Segmenter when available for identifying words to spell check (#21225)

GitOrigin-RevId: f261ae331e0b1f87f4e88a512b25c8798f0b02c6
This commit is contained in:
Alf Eaton 2024-10-21 10:58:49 +01:00 committed by Copybot
parent 04dbb7d2f2
commit 6c4fac68ca
2 changed files with 70 additions and 27 deletions

View file

@ -7,10 +7,7 @@ import { spellCheckRequest } from './backend'
import { EditorView, ViewUpdate } from '@codemirror/view'
import { ChangeSet, Line, Range, RangeValue } from '@codemirror/state'
import { IgnoredWords } from '../../../dictionary/ignored-words'
import {
getNormalTextSpansFromLine,
NormalTextSpan,
} from '../../utils/tree-query'
import { getNormalTextSpansFromLine } from '../../utils/tree-query'
import { waitForParser } from '../wait-for-parser'
import { debugConsole } from '@/utils/debugging'
import type { HunspellManager } from '../../hunspell/HunspellManager'
@ -26,6 +23,7 @@ export class SpellChecker {
private waitingForParser = false
private firstCheckPending = false
private trackedChanges: ChangeSet
private readonly segmenter?: Intl.Segmenter
// eslint-disable-next-line no-useless-constructor
constructor(
@ -34,6 +32,32 @@ export class SpellChecker {
) {
debugConsole.log('SpellChecker', language, hunspellManager)
this.trackedChanges = ChangeSet.empty(0)
const locale = language.replace(/_/, '-')
try {
if (Intl.Segmenter) {
const supportedLocales = Intl.Segmenter.supportedLocalesOf([locale], {
localeMatcher: 'lookup',
})
if (supportedLocales.includes(locale)) {
this.segmenter = new Intl.Segmenter(locale, {
localeMatcher: 'lookup',
granularity: 'word',
})
}
}
} catch (error) {
// Intl.Segmenter could not be set up for this locale: log the error and leave this.segmenter unset so the WORD_REGEX fallback is used
debugConsole.error(error)
}
if (this.segmenter) {
debugConsole.log(`Using Intl.Segmenter for ${locale}`)
} else {
debugConsole.warn(`Not using Intl.Segmenter for ${locale}`)
}
}
destroy() {
@ -257,7 +281,13 @@ export class SpellChecker {
for (const i of changedLineNumbers) {
const line = view.state.doc.line(i)
wordsToCheck.push(
...getWordsFromLine(view, line, ignoredWords, this.language)
...getWordsFromLine(
view,
line,
ignoredWords,
this.language,
this.segmenter
)
)
}
@ -347,35 +377,48 @@ export const compileEffects = (results: {
]
}
export const getWordsFromLine = (
export function* getWordsFromLine(
view: EditorView,
line: Line,
ignoredWords: IgnoredWords | null,
lang: string
): Word[] => {
const normalTextSpans: Array<NormalTextSpan> = getNormalTextSpansFromLine(
view,
line
)
const words: Word[] = []
for (const span of normalTextSpans) {
for (const match of span.text.matchAll(WORD_REGEX)) {
const word = match[0].replace(/^'+/, '').replace(/'+$/, '')
if (!ignoredWords?.has(word)) {
const from = span.from + match.index
words.push(
new Word({
lang: string,
segmenter?: Intl.Segmenter
) {
for (const span of getNormalTextSpansFromLine(view, line)) {
if (segmenter) {
for (const value of segmenter.segment(span.text)) {
if (value.isWordLike && !ignoredWords?.has(value.segment)) {
const word = value.segment
const from = span.from + value.index
yield new Word({
text: word,
from,
to: from + word.length,
lineNumber: line.number,
lang,
})
)
}
}
} else {
for (const match of span.text.matchAll(WORD_REGEX)) {
let word = match[0].replace(/'+$/, '')
let from = span.from + match.index
while (word.startsWith("'")) {
word = word.slice(1)
from++
}
if (!ignoredWords?.has(word)) {
yield new Word({
text: word,
from,
to: from + word.length,
lineNumber: line.number,
lang,
})
}
}
}
}
return words
}
export type Mark = Range<RangeValue & { spec: { word: Word } }>

View file

@ -26,7 +26,7 @@ describe('SpellChecker', function () {
extensions,
})
const line = view.state.doc.line(1)
const words = getWordsFromLine(view, line, ignoredWords, lang)
const words = Array.from(getWordsFromLine(view, line, ignoredWords, lang))
expect(words).to.deep.equal([
{ text: 'Hello', from: 0, to: 5, lineNumber: 1, lang: 'en' },
{ text: 'test', from: 6, to: 10, lineNumber: 1, lang: 'en' },
@ -42,7 +42,7 @@ describe('SpellChecker', function () {
extensions,
})
const line = view.state.doc.line(1)
const words = getWordsFromLine(view, line, ignoredWords, lang)
const words = Array.from(getWordsFromLine(view, line, ignoredWords, lang))
expect(words).to.deep.equal([
{ text: 'Hello', from: 0, to: 5, lineNumber: 1, lang: 'en' },
{ text: 'one', from: 11, to: 14, lineNumber: 1, lang: 'en' },
@ -56,7 +56,7 @@ describe('SpellChecker', function () {
extensions,
})
const line = view.state.doc.line(1)
const words = getWordsFromLine(view, line, ignoredWords, lang)
const words = Array.from(getWordsFromLine(view, line, ignoredWords, lang))
expect(words).to.deep.equal([])
})
@ -66,7 +66,7 @@ describe('SpellChecker', function () {
extensions,
})
const line = view.state.doc.line(1)
const words = getWordsFromLine(view, line, ignoredWords, lang)
const words = Array.from(getWordsFromLine(view, line, ignoredWords, lang))
expect(words).to.deep.equal([
{ text: 'seven', from: 24, to: 29, lineNumber: 1, lang: 'en' },
{ text: 'eight', from: 30, to: 35, lineNumber: 1, lang: 'en' },
@ -79,7 +79,7 @@ describe('SpellChecker', function () {
extensions,
})
const line = view.state.doc.line(1)
const words = getWordsFromLine(view, line, ignoredWords, lang)
const words = Array.from(getWordsFromLine(view, line, ignoredWords, lang))
expect(words).to.deep.equal([
{ text: 'nine', from: 5, to: 9, lineNumber: 1, lang: 'en' },
{ text: 'ten', from: 15, to: 18, lineNumber: 1, lang: 'en' },