From 6c4fac68ca3ba6b3c4bbb180676fb9f5d10926a8 Mon Sep 17 00:00:00 2001 From: Alf Eaton Date: Mon, 21 Oct 2024 10:58:49 +0100 Subject: [PATCH] Use Intl.Segmenter when available for identifying words to spell check (#21225) GitOrigin-RevId: f261ae331e0b1f87f4e88a512b25c8798f0b02c6 --- .../extensions/spelling/spellchecker.ts | 87 ++++++++++++++----- .../extensions/spelling/spellchecker.test.ts | 10 +-- 2 files changed, 70 insertions(+), 27 deletions(-) diff --git a/services/web/frontend/js/features/source-editor/extensions/spelling/spellchecker.ts b/services/web/frontend/js/features/source-editor/extensions/spelling/spellchecker.ts index f27dea2445..72c83dc969 100644 --- a/services/web/frontend/js/features/source-editor/extensions/spelling/spellchecker.ts +++ b/services/web/frontend/js/features/source-editor/extensions/spelling/spellchecker.ts @@ -7,10 +7,7 @@ import { spellCheckRequest } from './backend' import { EditorView, ViewUpdate } from '@codemirror/view' import { ChangeSet, Line, Range, RangeValue } from '@codemirror/state' import { IgnoredWords } from '../../../dictionary/ignored-words' -import { - getNormalTextSpansFromLine, - NormalTextSpan, -} from '../../utils/tree-query' +import { getNormalTextSpansFromLine } from '../../utils/tree-query' import { waitForParser } from '../wait-for-parser' import { debugConsole } from '@/utils/debugging' import type { HunspellManager } from '../../hunspell/HunspellManager' @@ -26,6 +23,7 @@ export class SpellChecker { private waitingForParser = false private firstCheckPending = false private trackedChanges: ChangeSet + private readonly segmenter?: Intl.Segmenter // eslint-disable-next-line no-useless-constructor constructor( @@ -34,6 +32,32 @@ export class SpellChecker { ) { debugConsole.log('SpellChecker', language, hunspellManager) this.trackedChanges = ChangeSet.empty(0) + + const locale = language.replace(/_/, '-') + + try { + if (Intl.Segmenter) { + const supportedLocales = Intl.Segmenter.supportedLocalesOf([locale], { + localeMatcher: 'lookup', + }) + + if (supportedLocales.includes(locale)) { + this.segmenter = new Intl.Segmenter(locale, { + localeMatcher: 'lookup', + granularity: 'word', + }) + } + } + } catch (error) { + // ignore, not supported for some reason + debugConsole.error(error) + } + + if (this.segmenter) { + debugConsole.log(`Using Intl.Segmenter for ${locale}`) + } else { + debugConsole.warn(`Not using Intl.Segmenter for ${locale}`) + } } destroy() { @@ -257,7 +281,13 @@ export class SpellChecker { for (const i of changedLineNumbers) { const line = view.state.doc.line(i) wordsToCheck.push( - ...getWordsFromLine(view, line, ignoredWords, this.language) + ...getWordsFromLine( + view, + line, + ignoredWords, + this.language, + this.segmenter + ) ) } @@ -347,35 +377,48 @@ export const compileEffects = (results: { ] } -export const getWordsFromLine = ( +export function* getWordsFromLine( view: EditorView, line: Line, ignoredWords: IgnoredWords | null, - lang: string -): Word[] => { - const normalTextSpans: Array = getNormalTextSpansFromLine( - view, - line - ) - const words: Word[] = [] - for (const span of normalTextSpans) { - for (const match of span.text.matchAll(WORD_REGEX)) { - const word = match[0].replace(/^'+/, '').replace(/'+$/, '') - if (!ignoredWords?.has(word)) { - const from = span.from + match.index - words.push( - new Word({ + lang: string, + segmenter?: Intl.Segmenter +) { + for (const span of getNormalTextSpansFromLine(view, line)) { + if (segmenter) { + for (const value of segmenter.segment(span.text)) { + if (value.isWordLike && !ignoredWords?.has(value.segment)) { + const word = value.segment + const from = span.from + value.index + yield new Word({ text: word, from, to: from + word.length, lineNumber: line.number, lang, }) - ) + } + } + } else { + for (const match of span.text.matchAll(WORD_REGEX)) { + let word = match[0].replace(/'+$/, '') + let from = span.from + match.index + while (word.startsWith("'")) { + word = word.slice(1) + from++ + } + if (!ignoredWords?.has(word)) { + yield new Word({ + text: word, + from, + to: from + word.length, + lineNumber: line.number, + lang, + }) + } } } } - return words } export type Mark = Range diff --git a/services/web/test/frontend/features/source-editor/extensions/spelling/spellchecker.test.ts b/services/web/test/frontend/features/source-editor/extensions/spelling/spellchecker.test.ts index eff1d98d55..e3083d1c6f 100644 --- a/services/web/test/frontend/features/source-editor/extensions/spelling/spellchecker.test.ts +++ b/services/web/test/frontend/features/source-editor/extensions/spelling/spellchecker.test.ts @@ -26,7 +26,7 @@ describe('SpellChecker', function () { extensions, }) const line = view.state.doc.line(1) - const words = getWordsFromLine(view, line, ignoredWords, lang) + const words = Array.from(getWordsFromLine(view, line, ignoredWords, lang)) expect(words).to.deep.equal([ { text: 'Hello', from: 0, to: 5, lineNumber: 1, lang: 'en' }, { text: 'test', from: 6, to: 10, lineNumber: 1, lang: 'en' }, @@ -42,7 +42,7 @@ describe('SpellChecker', function () { extensions, }) const line = view.state.doc.line(1) - const words = getWordsFromLine(view, line, ignoredWords, lang) + const words = Array.from(getWordsFromLine(view, line, ignoredWords, lang)) expect(words).to.deep.equal([ { text: 'Hello', from: 0, to: 5, lineNumber: 1, lang: 'en' }, { text: 'one', from: 11, to: 14, lineNumber: 1, lang: 'en' }, @@ -56,7 +56,7 @@ describe('SpellChecker', function () { extensions, }) const line = view.state.doc.line(1) - const words = getWordsFromLine(view, line, ignoredWords, lang) + const words = Array.from(getWordsFromLine(view, line, ignoredWords, lang)) expect(words).to.deep.equal([]) }) @@ -66,7 +66,7 @@ describe('SpellChecker', function () { extensions, }) const line = view.state.doc.line(1) - const words = getWordsFromLine(view, line, ignoredWords, lang) + const words = Array.from(getWordsFromLine(view, line, ignoredWords, lang)) expect(words).to.deep.equal([ { text: 'seven', from: 24, to: 29, lineNumber: 1, lang: 'en' }, { text: 'eight', from: 30, to: 35, lineNumber: 1, lang: 'en' }, @@ -79,7 +79,7 @@ describe('SpellChecker', function () { extensions, }) const line = view.state.doc.line(1) - const words = getWordsFromLine(view, line, ignoredWords, lang) + const words = Array.from(getWordsFromLine(view, line, ignoredWords, lang)) expect(words).to.deep.equal([ { text: 'nine', from: 5, to: 9, lineNumber: 1, lang: 'en' }, { text: 'ten', from: 15, to: 18, lineNumber: 1, lang: 'en' },