mirror of
https://github.com/overleaf/overleaf.git
synced 2024-11-29 02:43:40 -05:00
Improve handling of whitespace in pasted HTML (#15074)
GitOrigin-RevId: 48876707e15e1ccd1bb71ce01121033d0b0dbeaf
This commit is contained in:
parent
1e85736f69
commit
31033224d5
4 changed files with 282 additions and 61 deletions
|
@ -0,0 +1,107 @@
|
|||
// elements which should contain only block elements
// (node names are upper case, as reported by `nodeName` for HTML elements)
const blockContainingElements = new Set([
  'DL',
  'FIELDSET',
  'FIGURE',
  'HEAD',
  'OL',
  'TABLE',
  'TBODY',
  'TFOOT',
  'THEAD',
  'TR',
  'UL',
])

// Type guard: true when the node's name is one of the block-only containers above.
// The check is by `nodeName` only; the node type itself is not inspected.
export const isBlockContainingElement = (node: Node): node is HTMLElement =>
  blockContainingElements.has(node.nodeName)
|
||||
|
||||
// elements which are block elements (as opposed to inline elements)
const blockElements = new Set([
  'ADDRESS',
  'ARTICLE',
  'ASIDE',
  'BLOCKQUOTE',
  'BODY',
  'CANVAS',
  'DD',
  'DIV',
  'DL',
  'DT',
  'FIELDSET',
  'FIGCAPTION',
  'FIGURE',
  'FOOTER',
  'FORM',
  'H1',
  'H2',
  'H3',
  'H4',
  'H5',
  'H6',
  'HEADER',
  'HGROUP',
  'HR',
  'LI',
  'MAIN',
  'NAV',
  'NOSCRIPT',
  'OL',
  'P',
  'PRE',
  'SECTION',
  'TABLE',
  'TBODY',
  'TD',
  'TFOOT',
  'TH',
  'THEAD',
  'TR',
  'UL',
  'VIDEO',
])

// Type guard: true when the node's name is in the block element list above.
// The check is by `nodeName` only; the node type itself is not inspected.
export const isBlockElement = (node: Node): node is HTMLElement =>
  blockElements.has(node.nodeName)
|
||||
|
||||
// elements which are treated as inline when trimming whitespace
const inlineElements = new Set([
  'A',
  'ABBR',
  'ACRONYM',
  'B',
  'BIG',
  'CITE',
  'DEL',
  'EM',
  'I',
  'INS',
  'SMALL',
  'SPAN',
  'STRONG',
  'SUB',
  'SUP',
  'TEXTAREA', // TODO
  'TIME',
  'TT',
])

// Type guard: true when the node's name is in the inline element list above.
// The check is by `nodeName` only; the node type itself is not inspected.
export const isInlineElement = (node: Node): node is HTMLElement =>
  inlineElements.has(node.nodeName)
|
||||
|
||||
// elements whose node name marks them as code
const codeElements = new Set(['CODE', 'PRE'])

// Type guard: true when the node's name is CODE or PRE.
// The check is by `nodeName` only; styling (e.g. font family) is not considered here.
export const isCodeElement = (node: Node): node is HTMLElement =>
  codeElements.has(node.nodeName)
|
||||
|
||||
// elements which are kept even when they have no child nodes
const keepEmptyBlockElements = new Set(['TD', 'TH', 'CANVAS', 'DT', 'DD', 'HR'])

// Type guard: true when the node is an element with no child nodes and is not
// in the keep list above.
// The element check keeps the `node is HTMLElement` predicate truthful: without
// it, any childless non-element node (e.g. an empty text node) would also match.
// NOTE: this does not itself check that the element is a block element; callers
// are expected to have established that first.
export const shouldRemoveEmptyBlockElement = (
  node: Node
): node is HTMLElement =>
  node.nodeType === Node.ELEMENT_NODE &&
  !keepEmptyBlockElements.has(node.nodeName) &&
  !node.hasChildNodes()
|
||||
|
||||
// Type guard: true when the node's `nodeType` is `Node.TEXT_NODE`.
export const isTextNode = (node: Node): node is Text =>
  node.nodeType === Node.TEXT_NODE
|
||||
|
||||
// Type guard: true when the node's `nodeType` is `Node.ELEMENT_NODE`.
export const isElementNode = (node: Node): node is HTMLElement =>
  node.nodeType === Node.ELEMENT_NODE
|
|
@ -6,6 +6,14 @@ import {
|
|||
storePastedContent,
|
||||
} from './pasted-content'
|
||||
import { debugConsole } from '@/utils/debugging'
|
||||
import {
|
||||
isBlockContainingElement,
|
||||
isBlockElement,
|
||||
isElementNode,
|
||||
isInlineElement,
|
||||
isTextNode,
|
||||
shouldRemoveEmptyBlockElement,
|
||||
} from './html-elements'
|
||||
|
||||
export const pasteHtml = [
|
||||
Prec.highest(
|
||||
|
@ -49,12 +57,18 @@ export const pasteHtml = [
|
|||
return false
|
||||
}
|
||||
|
||||
// if the only content is in a code block, use the plain text version
|
||||
if (onlyCode(documentElement)) {
|
||||
const bodyElement = documentElement.querySelector('body')
|
||||
// DOMParser should always create a body element, so this is mostly for TypeScript
|
||||
if (!bodyElement) {
|
||||
return false
|
||||
}
|
||||
|
||||
const latex = htmlToLaTeX(documentElement)
|
||||
// if the only content is in a code block, use the plain text version
|
||||
if (onlyCode(bodyElement)) {
|
||||
return false
|
||||
}
|
||||
|
||||
const latex = htmlToLaTeX(bodyElement)
|
||||
|
||||
// if there's no formatting, use the plain text version
|
||||
if (latex === text && clipboardData.files.length === 0) {
|
||||
|
@ -121,25 +135,38 @@ const hasProgId = (documentElement: HTMLElement) => {
|
|||
return meta && meta.content.trim().length > 0
|
||||
}
|
||||
|
||||
const htmlToLaTeX = (documentElement: HTMLElement) => {
|
||||
const htmlToLaTeX = (bodyElement: HTMLElement) => {
|
||||
// remove style elements
|
||||
removeUnwantedElements(documentElement, 'style')
|
||||
removeUnwantedElements(bodyElement, 'style')
|
||||
|
||||
// replace non-breaking spaces added by Chrome on copy
|
||||
processWhitespace(documentElement)
|
||||
let before: string | null = null
|
||||
let after: string | null = null
|
||||
|
||||
// repeat until the content stabilises
|
||||
do {
|
||||
before = bodyElement.textContent
|
||||
|
||||
// normalise whitespace in text
|
||||
normaliseWhitespace(bodyElement)
|
||||
|
||||
// replace unwanted whitespace in blocks
|
||||
processWhitespaceInBlocks(bodyElement)
|
||||
|
||||
after = bodyElement.textContent
|
||||
} while (before !== after)
|
||||
|
||||
// pre-process table elements
|
||||
processTables(documentElement)
|
||||
processTables(bodyElement)
|
||||
|
||||
// pre-process lists
|
||||
processLists(documentElement)
|
||||
processLists(bodyElement)
|
||||
|
||||
// protect special characters in non-LaTeX text nodes
|
||||
protectSpecialCharacters(documentElement)
|
||||
protectSpecialCharacters(bodyElement)
|
||||
|
||||
processMatchedElements(documentElement)
|
||||
processMatchedElements(bodyElement)
|
||||
|
||||
const text = documentElement.textContent
|
||||
const text = bodyElement.textContent
|
||||
|
||||
if (!text) {
|
||||
return ''
|
||||
|
@ -151,24 +178,102 @@ const htmlToLaTeX = (documentElement: HTMLElement) => {
|
|||
.replaceAll('', '')
|
||||
// normalise multiple newlines
|
||||
.replaceAll(/\n{2,}/g, '\n\n')
|
||||
// only allow a single newline at the start and end
|
||||
.replaceAll(/(^\n+|\n+$)/g, '\n')
|
||||
// replace tab with 4 spaces (hard-coded indent unit)
|
||||
.replaceAll('\t', ' ')
|
||||
)
|
||||
}
|
||||
|
||||
const processWhitespace = (documentElement: HTMLElement) => {
|
||||
// Trim leading whitespace from the text nodes directly inside `element`,
// recursing into inline child elements so that whitespace is tracked across
// inline boundaries.
//
// `precedingSpace` says whether the content before `element` already ended in
// whitespace (or a block boundary); when true, leading whitespace of the next
// text node is removed. Text nodes that become empty are removed entirely.
//
// Returns the updated `precedingSpace` state after the last child, so that a
// caller processing a following sibling can continue from it.
const trimInlineElements = (
  element: HTMLElement,
  precedingSpace = true
): boolean => {
  // `childNodes` is a live list: removing the current node while iterating it
  // directly would shift the remaining nodes and skip the following sibling,
  // so iterate over a snapshot instead.
  for (const node of Array.from(element.childNodes)) {
    if (isTextNode(node)) {
      let text = node.textContent ?? ''

      if (precedingSpace) {
        text = text.replace(/^\s+/, '')
      }

      if (text === '') {
        // nothing left: drop the node and leave `precedingSpace` unchanged
        node.remove()
      } else {
        node.textContent = text
        precedingSpace = /\s$/.test(text)
      }
    } else if (isInlineElement(node)) {
      precedingSpace = trimInlineElements(node, precedingSpace)
    } else if (isBlockElement(node)) {
      precedingSpace = true // TODO
    } else {
      precedingSpace = false // TODO
    }
  }

  // TODO: trim whitespace at the end

  return precedingSpace
}
|
||||
|
||||
// Tidy whitespace at block level, skipping elements that contain code:
// - trims leading whitespace in the inline content of `documentElement`
// - removes text nodes that are direct children of elements which should only
//   contain blocks
// - trims the inline content of each block element, and removes block elements
//   that end up with no child nodes (except those that are kept when empty)
const processWhitespaceInBlocks = (documentElement: HTMLElement) => {
  trimInlineElements(documentElement)

  const walker = document.createTreeWalker(
    documentElement,
    NodeFilter.SHOW_ELEMENT,
    node =>
      isElementNode(node) && isElementContainingCode(node)
        ? NodeFilter.FILTER_REJECT
        : NodeFilter.FILTER_ACCEPT
  )

  let node = walker.nextNode()

  while (node) {
    const current = node

    // TODO: remove leading newline from pre, code and textarea?
    if (isBlockContainingElement(current)) {
      // remove all text nodes directly inside elements that should only contain blocks
      // (iterate a snapshot, as `childNodes` is live and removal would skip siblings)
      for (const childNode of Array.from(current.childNodes)) {
        if (isTextNode(childNode)) {
          childNode.remove()
        }
      }
    }

    let emptyBlock: HTMLElement | null = null

    if (isBlockElement(current)) {
      trimInlineElements(current)

      if (shouldRemoveEmptyBlockElement(current)) {
        emptyBlock = current
        // TODO: and parents?
      }
    }

    // advance the walker before detaching the current node: the walker cannot
    // find the rest of the tree from a node that is no longer attached to it
    node = walker.nextNode()
    emptyBlock?.remove()
  }
}
|
||||
|
||||
// Normalise whitespace in every text node under `documentElement`, except for
// text inside elements that contain code:
// - a text node consisting only of whitespace becomes a single space
// - otherwise, each run of collapsible whitespace becomes a single space
const normaliseWhitespace = (documentElement: HTMLElement) => {
  const walker = document.createTreeWalker(
    documentElement,
    // elements must be shown as well as text: the filter callback is only
    // invoked for node types included here, so with text nodes alone the
    // code-element rejection below could never apply
    NodeFilter.SHOW_ELEMENT | NodeFilter.SHOW_TEXT,
    node =>
      isElementNode(node) && isElementContainingCode(node)
        ? NodeFilter.FILTER_REJECT
        : NodeFilter.FILTER_ACCEPT
  )

  for (let node = walker.nextNode(); node; node = walker.nextNode()) {
    // only text nodes are rewritten; setting textContent on an element would
    // replace all of its children
    if (!isTextNode(node)) {
      continue
    }

    const text = node.textContent
    if (text !== null) {
      if (/^\s+$/.test(text)) {
        // replace nodes containing only whitespace (including non-breaking space) with a single space
        node.textContent = ' '
      } else {
        // collapse contiguous whitespace (except for non-breaking space) to a single space
        node.textContent = text.replaceAll(/[\n\r\f\t \u2028\u2029]+/g, ' ')
      }
    }
  }
}
|
||||
|
||||
// TODO: negative lookbehind once Safari supports it
|
||||
const specialCharacterRegExp = /(^|[^\\])([#$%&~_^\\{}])/g
|
||||
|
@ -187,8 +292,8 @@ const specialCharacterReplacer = (
|
|||
}
|
||||
|
||||
// True when the element is a CODE element, or a PRE element whose inline
// `font-family` style mentions "monospace".
const isElementContainingCode = (element: HTMLElement) =>
  element.nodeName === 'CODE' ||
  (element.nodeName === 'PRE' && element.style.fontFamily.includes('monospace'))
|
||||
|
||||
const protectSpecialCharacters = (documentElement: HTMLElement) => {
|
||||
const walker = document.createTreeWalker(
|
||||
|
@ -201,7 +306,7 @@ const protectSpecialCharacters = (documentElement: HTMLElement) => {
|
|||
)
|
||||
|
||||
for (let node = walker.nextNode(); node; node = walker.nextNode()) {
|
||||
if (node.nodeType === Node.TEXT_NODE) {
|
||||
if (isTextNode(node)) {
|
||||
const text = node.textContent
|
||||
if (text) {
|
||||
// replace non-backslash-prefixed characters
|
||||
|
@ -289,34 +394,8 @@ const processLists = (element: HTMLElement) => {
|
|||
}
|
||||
}
|
||||
|
||||
const removeNonContentTextNodes = (table: HTMLTableElement) => {
|
||||
// remove text nodes that are direct children of non-content table elements
|
||||
const containers = table.querySelectorAll('thead,tbody,tr')
|
||||
for (const element of [table, ...containers]) {
|
||||
for (const childNode of element.childNodes) {
|
||||
if (childNode.nodeType === Node.TEXT_NODE) {
|
||||
element.removeChild(childNode)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// remove whitespace-only text nodes at the start or end of table cells
|
||||
for (const element of table.querySelectorAll('th,td')) {
|
||||
for (const childNode of [element.firstChild, element.lastChild]) {
|
||||
if (
|
||||
childNode?.nodeType === Node.TEXT_NODE &&
|
||||
childNode.textContent?.trim() === ''
|
||||
) {
|
||||
element.removeChild(childNode)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const processTables = (element: HTMLElement) => {
|
||||
for (const table of element.querySelectorAll('table')) {
|
||||
removeNonContentTextNodes(table)
|
||||
|
||||
// create a wrapper element for the table and the caption
|
||||
const container = document.createElement('div')
|
||||
container.className = 'ol-table-wrap'
|
||||
|
@ -380,7 +459,7 @@ const processTables = (element: HTMLElement) => {
|
|||
}
|
||||
|
||||
// Type guard: true when `element` is non-null and its node name is TR.
const isTableRow = (element: Element | null): element is HTMLTableRowElement =>
  element?.nodeName === 'TR'
|
||||
|
||||
const cellAlignment = new Map([
|
||||
['left', 'l'],
|
||||
|
@ -506,7 +585,7 @@ const rowHasBorderStyle = (
|
|||
|
||||
// Type guard: true when `element` is non-null and its node name is TR.
const isTableRowElement = (
  element: Element | null
): element is HTMLTableRowElement => element?.nodeName === 'TR'
|
||||
|
||||
const nextRowHasBorderStyle = (
|
||||
element: HTMLTableRowElement,
|
||||
|
@ -645,7 +724,7 @@ const selectors = [
|
|||
// TODO: h6?
|
||||
createSelector({
|
||||
selector: 'br',
|
||||
match: element => element.parentElement?.nodeName !== 'TD', // TODO: why?
|
||||
match: element => !element.closest('table'),
|
||||
start: () => `\n\n`,
|
||||
}),
|
||||
createSelector({
|
||||
|
|
|
@ -66,10 +66,6 @@ export const Latex = (args: any, { globals: { theme } }: any) => {
|
|||
|
||||
useMeta({
|
||||
'ol-showSymbolPalette': true,
|
||||
'ol-splitTestVariants': {
|
||||
'figure-modal': 'enabled',
|
||||
'table-generator': 'enabled',
|
||||
},
|
||||
})
|
||||
|
||||
return <SourceEditor />
|
||||
|
|
|
@ -387,7 +387,7 @@ describe('<CodeMirrorEditor/> paste HTML in Visual mode', function () {
|
|||
cy.get('@content').trigger('paste', { clipboardData })
|
||||
|
||||
cy.get('@content').should('have.text', 'test foobarbaztest')
|
||||
cy.get('.cm-line').should('have.length', 8)
|
||||
cy.get('.cm-line').should('have.length', 7)
|
||||
})
|
||||
|
||||
it('handles pasted paragraphs in list items and table cells', function () {
|
||||
|
@ -410,7 +410,7 @@ describe('<CodeMirrorEditor/> paste HTML in Visual mode', function () {
|
|||
'have.text',
|
||||
'test foobarbaz foo foo foo foofootest'
|
||||
)
|
||||
cy.get('.cm-line').should('have.length', 15)
|
||||
cy.get('.cm-line').should('have.length', 14)
|
||||
})
|
||||
|
||||
it('handles pasted inline code', function () {
|
||||
|
@ -662,6 +662,45 @@ describe('<CodeMirrorEditor/> paste HTML in Visual mode', function () {
|
|||
cy.get('.cm-line').should('have.length', 8)
|
||||
})
|
||||
|
||||
it('tidies whitespace in pasted lists', function () {
|
||||
mountEditor()
|
||||
|
||||
const data = `<ul>
|
||||
<li> foo </li>
|
||||
<li>
|
||||
|
||||
<p>
|
||||
|
||||
<b>test</b></p>
|
||||
<p>test test test
|
||||
test test
|
||||
test test test</p>
|
||||
</li>
|
||||
</ul>`
|
||||
|
||||
const clipboardData = new DataTransfer()
|
||||
clipboardData.setData('text/html', data)
|
||||
cy.get('@content').trigger('paste', { clipboardData })
|
||||
|
||||
cy.get('.cm-line').should('have.length', 6)
|
||||
cy.get('@content').should(
|
||||
'have.text',
|
||||
' foo testtest test test test test test test test'
|
||||
)
|
||||
})
|
||||
|
||||
it('collapses whitespace in adjacent inline elements', function () {
|
||||
mountEditor()
|
||||
|
||||
const data = `<p><b> foo </b><span> test </span><i> bar </i> baz</p>`
|
||||
|
||||
const clipboardData = new DataTransfer()
|
||||
clipboardData.setData('text/html', data)
|
||||
cy.get('@content').trigger('paste', { clipboardData })
|
||||
|
||||
cy.get('@content').should('have.text', 'foo test bar baz')
|
||||
})
|
||||
|
||||
it('treats a pasted image as a figure even if there is HTML', function () {
|
||||
mountEditor()
|
||||
|
||||
|
|
Loading…
Reference in a new issue