Improve handling of whitespace in pasted HTML (#15074)

GitOrigin-RevId: 48876707e15e1ccd1bb71ce01121033d0b0dbeaf
2024-11-29 02:43:40 -05:00 · 2023-10-09 09:52:41 +01:00 · 2023-10-09 09:52:41 +01:00 · 31033224d5
commit 31033224d5
parent 1e85736f69
4 changed files with 282 additions and 61 deletions
--- a/services/web/frontend/js/features/source-editor/extensions/visual/html-elements.ts
+++ b/services/web/frontend/js/features/source-editor/extensions/visual/html-elements.ts
@ -0,0 +1,107 @@
+/**
+ * Node names of elements which should contain only block elements
+ * (compared against `node.nodeName`).
+ */
+const blockOnlyContainerNames: ReadonlySet<string> = new Set([
+  'DL',
+  'FIELDSET',
+  'FIGURE',
+  'HEAD',
+  'OL',
+  'TABLE',
+  'TBODY',
+  'TFOOT',
+  'THEAD',
+  'TR',
+  'UL',
+])
+
+/**
+ * Tests whether the node's name is one of the block-only containers.
+ *
+ * @param node - the node to test
+ * @returns true when `node.nodeName` is in the block-only container set
+ */
+export function isBlockContainingElement(node: Node): node is HTMLElement {
+  return blockOnlyContainerNames.has(node.nodeName)
+}
+
+/**
+ * Node names of elements which are block elements (as opposed to inline
+ * elements), compared against `node.nodeName`.
+ */
+const blockElementNames: ReadonlySet<string> = new Set([
+  'ADDRESS',
+  'ARTICLE',
+  'ASIDE',
+  'BLOCKQUOTE',
+  'BODY',
+  'CANVAS',
+  'DD',
+  'DIV',
+  'DL',
+  'DT',
+  'FIELDSET',
+  'FIGCAPTION',
+  'FIGURE',
+  'FOOTER',
+  'FORM',
+  'H1',
+  'H2',
+  'H3',
+  'H4',
+  'H5',
+  'H6',
+  'HEADER',
+  'HGROUP',
+  'HR',
+  'LI',
+  'MAIN',
+  'NAV',
+  'NOSCRIPT',
+  'OL',
+  'P',
+  'PRE',
+  'SECTION',
+  'TABLE',
+  'TBODY',
+  'TD',
+  'TFOOT',
+  'TH',
+  'THEAD',
+  'TR',
+  'UL',
+  'VIDEO',
+])
+
+/**
+ * Tests whether the node's name is one of the block element names.
+ *
+ * @param node - the node to test
+ * @returns true when `node.nodeName` is in the block element set
+ */
+export function isBlockElement(node: Node): node is HTMLElement {
+  return blockElementNames.has(node.nodeName)
+}
+
+/**
+ * Node names of elements which are treated as inline elements
+ * (compared against `node.nodeName`).
+ */
+const inlineElementNames: ReadonlySet<string> = new Set([
+  'A',
+  'ABBR',
+  'ACRONYM',
+  'B',
+  'BIG',
+  'CITE',
+  'DEL',
+  'EM',
+  'I',
+  'INS',
+  'SMALL',
+  'SPAN',
+  'STRONG',
+  'SUB',
+  'SUP',
+  'TEXTAREA', // TODO
+  'TIME',
+  'TT',
+])
+
+/**
+ * Tests whether the node's name is one of the inline element names.
+ *
+ * @param node - the node to test
+ * @returns true when `node.nodeName` is in the inline element set
+ */
+export function isInlineElement(node: Node): node is HTMLElement {
+  return inlineElementNames.has(node.nodeName)
+}
+
+// node names of elements which are treated as code
+const codeElementNames: ReadonlySet<string> = new Set(['CODE', 'PRE'])
+
+/**
+ * Tests whether the node's name is one of the code element names.
+ *
+ * @param node - the node to test
+ * @returns true when `node.nodeName` is `CODE` or `PRE`
+ */
+export function isCodeElement(node: Node): node is HTMLElement {
+  return codeElementNames.has(node.nodeName)
+}
+
+// node names of elements which are kept even when they have no child nodes
+const emptyAllowedNames: ReadonlySet<string> = new Set([
+  'TD',
+  'TH',
+  'CANVAS',
+  'DT',
+  'DD',
+  'HR',
+])
+
+/**
+ * Tests whether a node is empty and is not one of the elements that may stay empty.
+ *
+ * Note that this only looks at the node name and child count: it does not
+ * itself check that the node is a block element.
+ *
+ * @param node - the node to test
+ * @returns true when the node has no child nodes and its name is not in the keep-empty set
+ */
+export function shouldRemoveEmptyBlockElement(
+  node: Node
+): node is HTMLElement {
+  if (node.hasChildNodes()) {
+    return false
+  }
+
+  return !emptyAllowedNames.has(node.nodeName)
+}
+
+/**
+ * Type guard for text nodes.
+ *
+ * @param node - the node to test
+ * @returns true when `node.nodeType` is `Node.TEXT_NODE`
+ */
+export function isTextNode(node: Node): node is Text {
+  return node.nodeType === Node.TEXT_NODE
+}
+
+/**
+ * Type guard for element nodes.
+ *
+ * @param node - the node to test
+ * @returns true when `node.nodeType` is `Node.ELEMENT_NODE`
+ */
+export function isElementNode(node: Node): node is HTMLElement {
+  return node.nodeType === Node.ELEMENT_NODE
+}
--- a/services/web/frontend/js/features/source-editor/extensions/visual/paste-html.ts
+++ b/services/web/frontend/js/features/source-editor/extensions/visual/paste-html.ts
@ -6,6 +6,14 @@ import {
  storePastedContent,
 } from './pasted-content'
 import { debugConsole } from '@/utils/debugging'
+import {
+  isBlockContainingElement,
+  isBlockElement,
+  isElementNode,
+  isInlineElement,
+  isTextNode,
+  shouldRemoveEmptyBlockElement,
+} from './html-elements'

 export const pasteHtml = [
  Prec.highest(
@ -49,12 +57,18 @@ export const pasteHtml = [
            return false
          }

-          // if the only content is in a code block, use the plain text version
-          if (onlyCode(documentElement)) {
+          const bodyElement = documentElement.querySelector('body')
+          // DOMParser should always create a body element, so this is mostly for TypeScript
+          if (!bodyElement) {
            return false
          }

-          const latex = htmlToLaTeX(documentElement)
+          // if the only content is in a code block, use the plain text version
+          if (onlyCode(bodyElement)) {
+            return false
+          }
+
+          const latex = htmlToLaTeX(bodyElement)

          // if there's no formatting, use the plain text version
          if (latex === text && clipboardData.files.length === 0) {
@ -121,25 +135,38 @@ const hasProgId = (documentElement: HTMLElement) => {
  return meta && meta.content.trim().length > 0
 }

-const htmlToLaTeX = (documentElement: HTMLElement) => {
+const htmlToLaTeX = (bodyElement: HTMLElement) => {
  // remove style elements
-  removeUnwantedElements(documentElement, 'style')
+  removeUnwantedElements(bodyElement, 'style')

-  // replace non-breaking spaces added by Chrome on copy
-  processWhitespace(documentElement)
+  let before: string | null = null
+  let after: string | null = null
+
+  // repeat until the content stabilises
+  do {
+    before = bodyElement.textContent
+
+    // normalise whitespace in text
+    normaliseWhitespace(bodyElement)
+
+    // replace unwanted whitespace in blocks
+    processWhitespaceInBlocks(bodyElement)
+
+    after = bodyElement.textContent
+  } while (before !== after)

  // pre-process table elements
-  processTables(documentElement)
+  processTables(bodyElement)

  // pre-process lists
-  processLists(documentElement)
+  processLists(bodyElement)

  // protect special characters in non-LaTeX text nodes
-  protectSpecialCharacters(documentElement)
+  protectSpecialCharacters(bodyElement)

-  processMatchedElements(documentElement)
+  processMatchedElements(bodyElement)

-  const text = documentElement.textContent
+  const text = bodyElement.textContent

  if (!text) {
    return ''
@ -151,24 +178,102 @@ const htmlToLaTeX = (documentElement: HTMLElement) => {
      .replaceAll('', '')
      // normalise multiple newlines
      .replaceAll(/\n{2,}/g, '\n\n')
+      // only allow a single newline at the start and end
+      .replaceAll(/(^\n+|\n+$)/g, '\n')
+      // replace tab with 4 spaces (hard-coded indent unit)
+      .replaceAll('\t', '    ')
  )
 }

-const processWhitespace = (documentElement: HTMLElement) => {
+/**
+ * Remove leading whitespace from text that directly follows other whitespace
+ * (or the start of a block), recursing into inline elements.
+ *
+ * Text nodes that become empty are removed from the DOM.
+ *
+ * @param element - the element whose child nodes are processed in place
+ * @param precedingSpace - whether the content before the first child should be
+ *   treated as ending in whitespace (true at the start of a block)
+ * @returns whether the content after the last child should be treated as
+ *   preceded by whitespace
+ */
+const trimInlineElements = (
+  element: HTMLElement,
+  precedingSpace = true
+): boolean => {
+  // Iterate over a snapshot: childNodes is a live NodeList, so removing a node
+  // while iterating it directly would skip the sibling that follows it.
+  for (const node of Array.from(element.childNodes)) {
+    if (isTextNode(node)) {
+      let text = node.data
+
+      if (precedingSpace) {
+        text = text.replace(/^\s+/, '')
+      }
+
+      if (text === '') {
+        // nothing left in this text node; precedingSpace is left unchanged
+        node.remove()
+      } else {
+        node.data = text
+        precedingSpace = /\s$/.test(text)
+      }
+    } else if (isInlineElement(node)) {
+      // whitespace state carries across inline element boundaries
+      precedingSpace = trimInlineElements(node, precedingSpace)
+    } else if (isBlockElement(node)) {
+      precedingSpace = true // TODO
+    } else {
+      precedingSpace = false // TODO
+    }
+  }
+
+  // TODO: trim whitespace at the end
+
+  return precedingSpace
+}
+
+/**
+ * Tidy whitespace around block elements, in place.
+ *
+ * - trims leading whitespace in the inline content of the root and of every block element
+ * - removes text nodes that are direct children of block-only containers
+ * - removes block elements that end up with no child nodes (except those which may stay empty)
+ *
+ * Elements containing code, and everything inside them, are not visited.
+ *
+ * @param documentElement - the root element to process
+ */
+const processWhitespaceInBlocks = (documentElement: HTMLElement) => {
+  trimInlineElements(documentElement)
+
  const walker = document.createTreeWalker(
    documentElement,
-    NodeFilter.SHOW_TEXT
+    NodeFilter.SHOW_ELEMENT,
+    node =>
+      isElementNode(node) && isElementContainingCode(node)
+        ? NodeFilter.FILTER_REJECT
+        : NodeFilter.FILTER_ACCEPT
  )

+  // Empty blocks are detached only after the walk has finished: removing the
+  // walker's current node would leave it with no parent or siblings to
+  // continue from, so the elements after it would never be visited.
+  const emptyBlockElements: HTMLElement[] = []
+
  for (let node = walker.nextNode(); node; node = walker.nextNode()) {
-    if (node.textContent === ' ') {
-      node.textContent = ' '
+    // TODO: remove leading newline from pre, code and textarea?
+    if (isBlockContainingElement(node)) {
+      // remove all text nodes directly inside elements that should only contain blocks
+      // (iterating a snapshot, as removing from the live childNodes list would skip siblings)
+      for (const childNode of Array.from(node.childNodes)) {
+        if (isTextNode(childNode)) {
+          childNode.remove()
        }
      }
    }

-const isElementNode = (node: Node): node is HTMLElement =>
-  node.nodeType === Node.ELEMENT_NODE
+    if (isBlockElement(node)) {
+      trimInlineElements(node)
+
+      if (shouldRemoveEmptyBlockElement(node)) {
+        emptyBlockElements.push(node)
+        // TODO: and parents?
+      }
+    }
+  }
+
+  for (const emptyBlockElement of emptyBlockElements) {
+    emptyBlockElement.remove()
+  }
+}
+
+/**
+ * Normalise whitespace in every text node outside code elements, in place.
+ *
+ * - a text node containing only whitespace (including non-breaking space)
+ *   becomes a single space
+ * - in any other text node, each run of whitespace (except non-breaking
+ *   space) is collapsed to a single space
+ *
+ * @param documentElement - the root element to process
+ */
+const normaliseWhitespace = (documentElement: HTMLElement) => {
+  const walker = document.createTreeWalker(
+    documentElement,
+    // Elements must be shown as well as text: the filter callback is only
+    // called for node types included in whatToShow, so with SHOW_TEXT alone
+    // the subtrees of code elements would never be rejected.
+    NodeFilter.SHOW_ELEMENT | NodeFilter.SHOW_TEXT,
+    node =>
+      isElementNode(node) && isElementContainingCode(node)
+        ? NodeFilter.FILTER_REJECT
+        : NodeFilter.FILTER_ACCEPT
+  )
+
+  for (let node = walker.nextNode(); node; node = walker.nextNode()) {
+    // elements are only visited so that the filter can reject code subtrees
+    if (!isTextNode(node)) {
+      continue
+    }
+
+    const text = node.data
+    if (/^\s+$/.test(text)) {
+      // replace nodes containing only whitespace (including non-breaking space) with a single space
+      node.data = ' '
+    } else {
+      // collapse contiguous whitespace (except for non-breaking space) to a single space
+      node.data = text.replaceAll(/[\n\r\f\t \u2028\u2029]+/g, ' ')
+    }
+  }
+}

 // TODO: negative lookbehind once Safari supports it
 const specialCharacterRegExp = /(^|[^\\])([#$%&~_^\\{}])/g
@ -187,8 +292,8 @@ const specialCharacterReplacer = (
 }

 const isElementContainingCode = (element: HTMLElement) =>
-  element.tagName === 'CODE' ||
-  (element.tagName === 'PRE' && element.style.fontFamily.includes('monospace'))
+  element.nodeName === 'CODE' ||
+  (element.nodeName === 'PRE' && element.style.fontFamily.includes('monospace'))

 const protectSpecialCharacters = (documentElement: HTMLElement) => {
  const walker = document.createTreeWalker(
@ -201,7 +306,7 @@ const protectSpecialCharacters = (documentElement: HTMLElement) => {
  )

  for (let node = walker.nextNode(); node; node = walker.nextNode()) {
-    if (node.nodeType === Node.TEXT_NODE) {
+    if (isTextNode(node)) {
      const text = node.textContent
      if (text) {
        // replace non-backslash-prefixed characters
@ -289,34 +394,8 @@ const processLists = (element: HTMLElement) => {
  }
 }

-const removeNonContentTextNodes = (table: HTMLTableElement) => {
-  // remove text nodes that are direct children of non-content table elements
-  const containers = table.querySelectorAll('thead,tbody,tr')
-  for (const element of [table, ...containers]) {
-    for (const childNode of element.childNodes) {
-      if (childNode.nodeType === Node.TEXT_NODE) {
-        element.removeChild(childNode)
-      }
-    }
-  }
-
-  // remove whitespace-only text nodes at the start or end of table cells
-  for (const element of table.querySelectorAll('th,td')) {
-    for (const childNode of [element.firstChild, element.lastChild]) {
-      if (
-        childNode?.nodeType === Node.TEXT_NODE &&
-        childNode.textContent?.trim() === ''
-      ) {
-        element.removeChild(childNode)
-      }
-    }
-  }
-}
-
 const processTables = (element: HTMLElement) => {
  for (const table of element.querySelectorAll('table')) {
-    removeNonContentTextNodes(table)
-
    // create a wrapper element for the table and the caption
    const container = document.createElement('div')
    container.className = 'ol-table-wrap'
@ -380,7 +459,7 @@ const processTables = (element: HTMLElement) => {
 }

+// Type guard: true when an element was given and its node name is 'TR'.
+// NOTE(review): performs the same check as isTableRowElement elsewhere in this file.
 const isTableRow = (element: Element | null): element is HTMLTableRowElement =>
-  element?.tagName === 'TR'
+  element?.nodeName === 'TR'

 const cellAlignment = new Map([
  ['left', 'l'],
@ -506,7 +585,7 @@ const rowHasBorderStyle = (

+// Type guard: true when an element was given and its node name is 'TR'.
+// NOTE(review): performs the same check as isTableRow elsewhere in this file.
 const isTableRowElement = (
  element: Element | null
-): element is HTMLTableRowElement => element?.tagName === 'TR'
+): element is HTMLTableRowElement => element?.nodeName === 'TR'

 const nextRowHasBorderStyle = (
  element: HTMLTableRowElement,
@ -645,7 +724,7 @@ const selectors = [
  // TODO: h6?
  createSelector({
    selector: 'br',
-    match: element => element.parentElement?.nodeName !== 'TD', // TODO: why?
+    match: element => !element.closest('table'),
    start: () => `\n\n`,
  }),
  createSelector({
--- a/services/web/frontend/stories/source-editor/source-editor.stories.tsx
+++ b/services/web/frontend/stories/source-editor/source-editor.stories.tsx
@ -66,10 +66,6 @@ export const Latex = (args: any, { globals: { theme } }: any) => {

  useMeta({
    'ol-showSymbolPalette': true,
-    'ol-splitTestVariants': {
-      'figure-modal': 'enabled',
-      'table-generator': 'enabled',
-    },
  })

  return <SourceEditor />
--- a/services/web/test/frontend/features/source-editor/components/codemirror-editor-visual-paste-html.spec.tsx
+++ b/services/web/test/frontend/features/source-editor/components/codemirror-editor-visual-paste-html.spec.tsx
@ -387,7 +387,7 @@ describe('<CodeMirrorEditor/> paste HTML in Visual mode', function () {
    cy.get('@content').trigger('paste', { clipboardData })

    cy.get('@content').should('have.text', 'test foobarbaztest')
-    cy.get('.cm-line').should('have.length', 8)
+    cy.get('.cm-line').should('have.length', 7)
  })

  it('handles pasted paragraphs in list items and table cells', function () {
@ -410,7 +410,7 @@ describe('<CodeMirrorEditor/> paste HTML in Visual mode', function () {
      'have.text',
      'test foobarbaz foo foo foo foofootest'
    )
-    cy.get('.cm-line').should('have.length', 15)
+    cy.get('.cm-line').should('have.length', 14)
  })

  it('handles pasted inline code', function () {
@ -662,6 +662,45 @@ describe('<CodeMirrorEditor/> paste HTML in Visual mode', function () {
    cy.get('.cm-line').should('have.length', 8)
  })

+  // Pastes a list whose markup has indentation, blank lines and hard-wrapped
+  // paragraph text between and inside the list items, then checks the number
+  // of editor lines and the resulting text.
+  it('tidies whitespace in pasted lists', function () {
+    mountEditor()
+
+    // the whitespace inside this template literal is the fixture under test
+    const data = `<ul>
+<li>  foo  </li>
+   <li>
+
+  <p>
+
+  <b>test</b></p>
+<p>test test test
+test test
+test test test</p>
+    </li>
+</ul>`
+
+    // paste the markup as text/html clipboard data
+    const clipboardData = new DataTransfer()
+    clipboardData.setData('text/html', data)
+    cy.get('@content').trigger('paste', { clipboardData })
+
+    cy.get('.cm-line').should('have.length', 6)
+    cy.get('@content').should(
+      'have.text',
+      ' foo  testtest test test test test test test test'
+    )
+  })
+
+  // Pastes a paragraph of adjacent inline elements, each padded with spaces,
+  // and expects a single space between the words in the editor content.
+  it('collapses whitespace in adjacent inline elements', function () {
+    mountEditor()
+
+    const data = `<p><b> foo </b><span> test </span><i> bar </i> baz</p>`
+
+    // paste the markup as text/html clipboard data
+    const clipboardData = new DataTransfer()
+    clipboardData.setData('text/html', data)
+    cy.get('@content').trigger('paste', { clipboardData })
+
+    cy.get('@content').should('have.text', 'foo test bar baz')
+  })
+
  it('treats a pasted image as a figure even if there is HTML', function () {
    mountEditor()