Improve handling of whitespace in pasted HTML (#15074)

GitOrigin-RevId: 48876707e15e1ccd1bb71ce01121033d0b0dbeaf
This commit is contained in:
Alf Eaton 2023-10-09 09:52:41 +01:00 committed by Copybot
parent 1e85736f69
commit 31033224d5
4 changed files with 282 additions and 61 deletions

View file

@ -0,0 +1,107 @@
// elements which should contain only block elements
const blockContainingElements = new Set([
'DL',
'FIELDSET',
'FIGURE',
'HEAD',
'OL',
'TABLE',
'TBODY',
'TFOOT',
'THEAD',
'TR',
'UL',
])
export const isBlockContainingElement = (node: Node): node is HTMLElement =>
blockContainingElements.has(node.nodeName)
// elements which are block elements (as opposed to inline elements)
const blockElements = new Set([
'ADDRESS',
'ARTICLE',
'ASIDE',
'BLOCKQUOTE',
'BODY',
'CANVAS',
'DD',
'DIV',
'DL',
'DT',
'FIELDSET',
'FIGCAPTION',
'FIGURE',
'FOOTER',
'FORM',
'H1',
'H2',
'H3',
'H4',
'H5',
'H6',
'HEADER',
'HGROUP',
'HR',
'LI',
'MAIN',
'NAV',
'NOSCRIPT',
'OL',
'P',
'PRE',
'SECTION',
'TABLE',
'TBODY',
'TD',
'TFOOT',
'TH',
'THEAD',
'TR',
'UL',
'VIDEO',
])
export const isBlockElement = (node: Node): node is HTMLElement =>
blockElements.has(node.nodeName)
const inlineElements = new Set([
'A',
'ABBR',
'ACRONYM',
'B',
'BIG',
'CITE',
'DEL',
'EM',
'I',
'INS',
'SMALL',
'SPAN',
'STRONG',
'SUB',
'SUP',
'TEXTAREA', // TODO
'TIME',
'TT',
])
export const isInlineElement = (node: Node): node is HTMLElement =>
inlineElements.has(node.nodeName)
const codeElements = new Set(['CODE', 'PRE'])
export const isCodeElement = (node: Node): node is HTMLElement =>
codeElements.has(node.nodeName)
const keepEmptyBlockElements = new Set(['TD', 'TH', 'CANVAS', 'DT', 'DD', 'HR'])
export const shouldRemoveEmptyBlockElement = (
node: Node
): node is HTMLElement =>
!keepEmptyBlockElements.has(node.nodeName) && !node.hasChildNodes()
export const isTextNode = (node: Node): node is Text =>
node.nodeType === Node.TEXT_NODE
export const isElementNode = (node: Node): node is HTMLElement =>
node.nodeType === Node.ELEMENT_NODE

View file

@ -6,6 +6,14 @@ import {
storePastedContent,
} from './pasted-content'
import { debugConsole } from '@/utils/debugging'
import {
isBlockContainingElement,
isBlockElement,
isElementNode,
isInlineElement,
isTextNode,
shouldRemoveEmptyBlockElement,
} from './html-elements'
export const pasteHtml = [
Prec.highest(
@ -49,12 +57,18 @@ export const pasteHtml = [
return false
}
// if the only content is in a code block, use the plain text version
if (onlyCode(documentElement)) {
const bodyElement = documentElement.querySelector('body')
// DOMParser should always create a body element, so this is mostly for TypeScript
if (!bodyElement) {
return false
}
const latex = htmlToLaTeX(documentElement)
// if the only content is in a code block, use the plain text version
if (onlyCode(bodyElement)) {
return false
}
const latex = htmlToLaTeX(bodyElement)
// if there's no formatting, use the plain text version
if (latex === text && clipboardData.files.length === 0) {
@ -121,25 +135,38 @@ const hasProgId = (documentElement: HTMLElement) => {
return meta && meta.content.trim().length > 0
}
const htmlToLaTeX = (documentElement: HTMLElement) => {
const htmlToLaTeX = (bodyElement: HTMLElement) => {
// remove style elements
removeUnwantedElements(documentElement, 'style')
removeUnwantedElements(bodyElement, 'style')
// replace non-breaking spaces added by Chrome on copy
processWhitespace(documentElement)
let before: string | null = null
let after: string | null = null
// repeat until the content stabilises
do {
before = bodyElement.textContent
// normalise whitespace in text
normaliseWhitespace(bodyElement)
// replace unwanted whitespace in blocks
processWhitespaceInBlocks(bodyElement)
after = bodyElement.textContent
} while (before !== after)
// pre-process table elements
processTables(documentElement)
processTables(bodyElement)
// pre-process lists
processLists(documentElement)
processLists(bodyElement)
// protect special characters in non-LaTeX text nodes
protectSpecialCharacters(documentElement)
protectSpecialCharacters(bodyElement)
processMatchedElements(documentElement)
processMatchedElements(bodyElement)
const text = documentElement.textContent
const text = bodyElement.textContent
if (!text) {
return ''
@ -151,24 +178,102 @@ const htmlToLaTeX = (documentElement: HTMLElement) => {
.replaceAll('', '')
// normalise multiple newlines
.replaceAll(/\n{2,}/g, '\n\n')
// only allow a single newline at the start and end
.replaceAll(/(^\n+|\n+$)/g, '\n')
// replace tab with 4 spaces (hard-coded indent unit)
.replaceAll('\t', ' ')
)
}
const processWhitespace = (documentElement: HTMLElement) => {
const trimInlineElements = (
element: HTMLElement,
precedingSpace = true
): boolean => {
for (const node of element.childNodes) {
if (isTextNode(node)) {
let text = node.textContent!
if (precedingSpace) {
text = text.replace(/^\s+/, '')
}
if (text === '') {
node.remove()
} else {
node.textContent = text
precedingSpace = /\s$/.test(text)
}
} else if (isInlineElement(node)) {
precedingSpace = trimInlineElements(node, precedingSpace)
} else if (isBlockElement(node)) {
precedingSpace = true // TODO
} else {
precedingSpace = false // TODO
}
}
// TODO: trim whitespace at the end
return precedingSpace
}
const processWhitespaceInBlocks = (documentElement: HTMLElement) => {
trimInlineElements(documentElement)
const walker = document.createTreeWalker(
documentElement,
NodeFilter.SHOW_TEXT
NodeFilter.SHOW_ELEMENT,
node =>
isElementNode(node) && isElementContainingCode(node)
? NodeFilter.FILTER_REJECT
: NodeFilter.FILTER_ACCEPT
)
for (let node = walker.nextNode(); node; node = walker.nextNode()) {
if (node.textContent === ' ') {
node.textContent = ' '
// TODO: remove leading newline from pre, code and textarea?
if (isBlockContainingElement(node)) {
// remove all text nodes directly inside elements that should only contain blocks
for (const childNode of node.childNodes) {
if (isTextNode(childNode)) {
childNode.remove()
}
}
}
const isElementNode = (node: Node): node is HTMLElement =>
node.nodeType === Node.ELEMENT_NODE
if (isBlockElement(node)) {
trimInlineElements(node)
if (shouldRemoveEmptyBlockElement(node)) {
node.remove()
// TODO: and parents?
}
}
}
}
const normaliseWhitespace = (documentElement: HTMLElement) => {
const walker = document.createTreeWalker(
documentElement,
NodeFilter.SHOW_TEXT,
node =>
isElementNode(node) && isElementContainingCode(node)
? NodeFilter.FILTER_REJECT
: NodeFilter.FILTER_ACCEPT
)
for (let node = walker.nextNode(); node; node = walker.nextNode()) {
const text = node.textContent
if (text !== null) {
if (/^\s+$/.test(text)) {
// replace nodes containing only whitespace (including non-breaking space) with a single space
node.textContent = ' '
} else {
// collapse contiguous whitespace (except for non-breaking space) to a single space
node.textContent = text.replaceAll(/[\n\r\f\t \u2028\u2029]+/g, ' ')
}
}
}
}
// TODO: negative lookbehind once Safari supports it
const specialCharacterRegExp = /(^|[^\\])([#$%&~_^\\{}])/g
@ -187,8 +292,8 @@ const specialCharacterReplacer = (
}
const isElementContainingCode = (element: HTMLElement) =>
element.tagName === 'CODE' ||
(element.tagName === 'PRE' && element.style.fontFamily.includes('monospace'))
element.nodeName === 'CODE' ||
(element.nodeName === 'PRE' && element.style.fontFamily.includes('monospace'))
const protectSpecialCharacters = (documentElement: HTMLElement) => {
const walker = document.createTreeWalker(
@ -201,7 +306,7 @@ const protectSpecialCharacters = (documentElement: HTMLElement) => {
)
for (let node = walker.nextNode(); node; node = walker.nextNode()) {
if (node.nodeType === Node.TEXT_NODE) {
if (isTextNode(node)) {
const text = node.textContent
if (text) {
// replace non-backslash-prefixed characters
@ -289,34 +394,8 @@ const processLists = (element: HTMLElement) => {
}
}
const removeNonContentTextNodes = (table: HTMLTableElement) => {
// remove text nodes that are direct children of non-content table elements
const containers = table.querySelectorAll('thead,tbody,tr')
for (const element of [table, ...containers]) {
for (const childNode of element.childNodes) {
if (childNode.nodeType === Node.TEXT_NODE) {
element.removeChild(childNode)
}
}
}
// remove whitespace-only text nodes at the start or end of table cells
for (const element of table.querySelectorAll('th,td')) {
for (const childNode of [element.firstChild, element.lastChild]) {
if (
childNode?.nodeType === Node.TEXT_NODE &&
childNode.textContent?.trim() === ''
) {
element.removeChild(childNode)
}
}
}
}
const processTables = (element: HTMLElement) => {
for (const table of element.querySelectorAll('table')) {
removeNonContentTextNodes(table)
// create a wrapper element for the table and the caption
const container = document.createElement('div')
container.className = 'ol-table-wrap'
@ -380,7 +459,7 @@ const processTables = (element: HTMLElement) => {
}
const isTableRow = (element: Element | null): element is HTMLTableRowElement =>
element?.tagName === 'TR'
element?.nodeName === 'TR'
const cellAlignment = new Map([
['left', 'l'],
@ -506,7 +585,7 @@ const rowHasBorderStyle = (
const isTableRowElement = (
element: Element | null
): element is HTMLTableRowElement => element?.tagName === 'TR'
): element is HTMLTableRowElement => element?.nodeName === 'TR'
const nextRowHasBorderStyle = (
element: HTMLTableRowElement,
@ -645,7 +724,7 @@ const selectors = [
// TODO: h6?
createSelector({
selector: 'br',
match: element => element.parentElement?.nodeName !== 'TD', // TODO: why?
match: element => !element.closest('table'),
start: () => `\n\n`,
}),
createSelector({

View file

@ -66,10 +66,6 @@ export const Latex = (args: any, { globals: { theme } }: any) => {
useMeta({
'ol-showSymbolPalette': true,
'ol-splitTestVariants': {
'figure-modal': 'enabled',
'table-generator': 'enabled',
},
})
return <SourceEditor />

View file

@ -387,7 +387,7 @@ describe('<CodeMirrorEditor/> paste HTML in Visual mode', function () {
cy.get('@content').trigger('paste', { clipboardData })
cy.get('@content').should('have.text', 'test foobarbaztest')
cy.get('.cm-line').should('have.length', 8)
cy.get('.cm-line').should('have.length', 7)
})
it('handles pasted paragraphs in list items and table cells', function () {
@ -410,7 +410,7 @@ describe('<CodeMirrorEditor/> paste HTML in Visual mode', function () {
'have.text',
'test foobarbaz foo foo foo foofootest'
)
cy.get('.cm-line').should('have.length', 15)
cy.get('.cm-line').should('have.length', 14)
})
it('handles pasted inline code', function () {
@ -662,6 +662,45 @@ describe('<CodeMirrorEditor/> paste HTML in Visual mode', function () {
cy.get('.cm-line').should('have.length', 8)
})
it('tidies whitespace in pasted lists', function () {
mountEditor()
const data = `<ul>
<li> foo </li>
<li>
<p>
<b>test</b></p>
<p>test test test
test test
test test test</p>
</li>
</ul>`
const clipboardData = new DataTransfer()
clipboardData.setData('text/html', data)
cy.get('@content').trigger('paste', { clipboardData })
cy.get('.cm-line').should('have.length', 6)
cy.get('@content').should(
'have.text',
' foo testtest test test test test test test test'
)
})
it('collapses whitespace in adjacent inline elements', function () {
mountEditor()
const data = `<p><b> foo </b><span> test </span><i> bar </i> baz</p>`
const clipboardData = new DataTransfer()
clipboardData.setData('text/html', data)
cy.get('@content').trigger('paste', { clipboardData })
cy.get('@content').should('have.text', 'foo test bar baz')
})
it('treats a pasted image as a figure even if there is HTML', function () {
mountEditor()