[visual] Convert pasted HTML to LaTeX (#12094)

GitOrigin-RevId: a2fd4411ab8655eea834d321025efb9644081252
This commit is contained in:
Alf Eaton 2023-08-11 09:22:27 +01:00 committed by Copybot
parent e90ffd2b54
commit 8f1de5fa09
4 changed files with 656 additions and 0 deletions

View file

@ -689,6 +689,12 @@ const ProjectController = {
cb()
})
},
pasteHtmlAssignment(cb) {
SplitTestHandler.getAssignment(req, res, 'paste-html', () => {
// We'll pick up the assignment from the res.locals assignment.
cb()
})
},
sourceEditorToolbarAssigment(cb) {
SplitTestHandler.getAssignment(
req,

View file

@ -0,0 +1,413 @@
import { EditorView } from '@codemirror/view'
import { EditorSelection, Prec } from '@codemirror/state'
/**
 * Paste handler that converts HTML clipboard content to LaTeX and inserts the
 * result in place of every selection range.
 *
 * The handler is registered with `Prec.highest`. It returns `true` only when
 * it has actually inserted converted content; in every other case (no HTML on
 * the clipboard, empty HTML, a conversion that produces no text, or a
 * conversion error) it returns `false` so the paste is left to the remaining
 * paste handling.
 */
export const pasteHtml = Prec.highest(
  EditorView.domEventHandlers({
    paste(event, view) {
      const { clipboardData } = event

      if (!clipboardData?.types.includes('text/html')) {
        return false
      }

      const html = clipboardData.getData('text/html').trim()

      if (html.length === 0) {
        return false
      }

      // convert the HTML to LaTeX
      try {
        const latex = htmlToLaTeX(html)

        // The markup contained no text to insert (htmlToLaTeX returns '' in
        // that case). Claiming the event here would replace the selection
        // with nothing and hide the paste from every other handler.
        if (latex.trim().length === 0) {
          return false
        }

        view.dispatch(
          view.state.changeByRange(range => ({
            // replace the range and leave the cursor after the inserted text
            range: EditorSelection.cursor(range.from + latex.length),
            changes: { from: range.from, to: range.to, insert: latex },
          }))
        )

        return true
      } catch {
        // fall back to the default paste handler
        return false
      }
    },
  })
)
/**
 * Detaches every descendant of `documentElement` that matches `selector`.
 *
 * @param documentElement - root element to search within
 * @param selector - CSS selector identifying the elements to drop
 */
const removeUnwantedElements = (
  documentElement: HTMLElement,
  selector: string
) => {
  const unwanted = documentElement.querySelectorAll(selector)

  unwanted.forEach(node => {
    node.remove()
  })
}
/**
 * Converts an HTML string to LaTeX source.
 *
 * The HTML is parsed into a detached document, special characters in its text
 * nodes are escaped, LaTeX markup is inserted around the elements matched by
 * `selectors`, and the text content of the resulting tree is returned.
 *
 * @param html - the HTML to convert
 * @returns the LaTeX text, or an empty string when the document has no text
 */
const htmlToLaTeX = (html: string) => {
  const parser = new DOMParser()
  const { documentElement } = parser.parseFromString(html, 'text/html')

  // Remove elements whose text is not document content. The output is read
  // from the root element (which includes <head>), and `textContent` also
  // returns the text of <style>, <script> and <title>.
  removeUnwantedElements(documentElement, 'style, script, title')

  // protect special characters in non-LaTeX text nodes
  protectSpecialCharacters(documentElement)

  processMatchedElements(documentElement)

  const text = documentElement.textContent

  if (!text) {
    return ''
  }

  // normalise multiple newlines
  return text.replaceAll(/\n{2,}/g, '\n\n')
}
/**
 * Escapes LaTeX special characters (`# $ % & ~ _ ^ \ { }`) in the text nodes
 * below `documentElement`, in place.
 *
 * Each direct child of `documentElement` is handled separately:
 * - a child element that contains a backslash but no `<code>` element is
 *   assumed to already be LaTeX and is left untouched;
 * - otherwise every text node outside a `<code>` element has its special
 *   characters prefixed with a backslash, unless the character is already
 *   preceded by one.
 *
 * @param documentElement - root of the parsed document; mutated in place
 */
const protectSpecialCharacters = (documentElement: HTMLElement) => {
  for (const element of documentElement.childNodes) {
    const text = element.textContent

    if (!text) {
      continue
    }

    // if there are no code blocks, use backslash as an indicator of LaTeX code that shouldn't be protected
    if (
      element instanceof HTMLElement &&
      !element.querySelector('code') &&
      text.includes('\\')
    ) {
      continue
    }

    const walker = document.createTreeWalker(element, NodeFilter.SHOW_TEXT)

    for (let node = walker.nextNode(); node; node = walker.nextNode()) {
      const text = node.textContent

      if (!text) {
        continue
      }

      // leave text that's in a code block
      if (node.parentElement?.closest('code')) {
        continue
      }

      // Replace non-backslash-prefixed characters. The preceding character is
      // inspected by offset rather than captured by the pattern: a captured
      // prefix is consumed by the match, which left the second of two
      // adjacent special characters (e.g. `{}` or `%%`) unescaped.
      node.textContent = text.replace(
        /[#$%&~_^\\{}]/g,
        (char: string, offset: number) =>
          text[offset - 1] === '\\' ? char : `\\${char}`
      )
    }
  }
}
/**
 * Applies every rule in `selectors`, in order, to the parsed document.
 *
 * For each element that matches a rule's CSS selector (and passes the rule's
 * optional `match` predicate) the rule's `start` and `end` strings are added
 * to the tree as text nodes: as the element's first/last child when the rule
 * is marked `inside`, otherwise as the element's previous/next sibling.
 *
 * @param documentElement - root of the parsed document; mutated in place
 */
const processMatchedElements = (documentElement: HTMLElement) => {
  for (const rule of selectors) {
    const candidates = documentElement.querySelectorAll<any>(rule.selector)

    for (const target of candidates) {
      // rules without a predicate apply to every element the selector finds
      if (rule.match && !rule.match(target)) {
        continue
      }

      // opening markup
      if (rule.start) {
        const opening = document.createTextNode(rule.start(target))

        if (!rule.inside) {
          target.before(opening)
        } else {
          target.prepend(opening)
        }
      }

      // closing markup
      if (rule.end) {
        const closing = document.createTextNode(rule.end(target))

        if (!rule.inside) {
          target.after(closing)
        } else {
          target.append(closing)
        }
      }
    }
  }
}
/**
 * Collects the ancestors of `element` that match `selector`, nearest first.
 * The element itself is never included.
 *
 * @param element - element whose ancestors are inspected
 * @param selector - CSS selector the ancestors must match
 * @returns the matching ancestors, ordered from closest to furthest
 */
const matchingParents = (element: HTMLElement, selector: string) => {
  const found: Element[] = []

  let current = element.parentElement?.closest(selector)

  while (current) {
    found.push(current)
    // continue the search from the parent so the same ancestor isn't matched again
    current = current.parentElement?.closest(selector)
  }

  return found
}
/**
 * Builds the column specification for a `tabular` environment from the cells
 * of the first row of `element`.
 *
 * Every cell contributes an alignment letter (`l`, `r`, or `c` for anything
 * else) and `|` where it has a left/right border. A cell spanning several
 * columns contributes its alignment once per spanned column, so the
 * specification has as many columns as the row occupies.
 *
 * @param element - the table to analyse
 * @returns the space-separated column specification, or an empty string when
 *   the table has no row
 */
const tabular = (element: HTMLTableElement) => {
  const options: string[] = []

  // NOTE: only analysing cells in the first row
  const row = element.querySelector('tr')

  if (row) {
    // TODO: look for horizontal borders and insert \hline (or \toprule, \midrule, \bottomrule etc)?
    for (const cell of Array.from(row.cells)) {
      const { borderLeft, textAlign, borderRight } = cell.style

      if (borderLeft && borderLeft !== 'none') {
        // avoid duplicating when both left and right borders are defined
        if (options[options.length - 1] !== '|') {
          options.push('|')
        }
      }

      const alignment =
        textAlign === 'left' ? 'l' : textAlign === 'right' ? 'r' : 'c'

      // one column per spanned column, never fewer than one
      const span = Math.max(1, cell.colSpan || 1)

      for (let column = 0; column < span; column++) {
        options.push(alignment)
      }

      if (borderRight && borderRight !== 'none') {
        options.push('|')
      }
    }
  }

  return options.join(' ')
}
/**
 * Nesting depth used for indenting list markup: the number of `ul`/`ol`
 * ancestors of `element` minus one, never less than zero.
 */
const listDepth = (
  element: HTMLOListElement | HTMLUListElement | HTMLLIElement
): number => {
  const enclosingLists = matchingParents(element, 'ul,ol')

  return enclosingLists.length > 1 ? enclosingLists.length - 1 : 0
}
/**
 * Indentation for list markup: one tab character per level of `listDepth`.
 */
const listIndent = (
  element: HTMLOListElement | HTMLUListElement | HTMLLIElement
): string => {
  const depth = listDepth(element)

  return ''.padEnd(depth, '\t')
}
/**
 * A conversion rule: which elements it applies to and the text to add around
 * (or inside) each of them.
 */
type ElementSelector<T extends string, E extends HTMLElement = HTMLElement> = {
  /** CSS selector used to find candidate elements */
  selector: T
  /** optional predicate; when present, only elements it accepts are processed */
  match?: (element: E) => boolean
  /** text added at the start of the element */
  start?: (element: E) => string
  /** text added at the end of the element */
  end?: (element: E) => string
  /** when true the text is added as first/last child instead of as a sibling */
  inside?: boolean
}

/**
 * Identity-style helper for declaring a rule. Its purpose is typing: when the
 * selector is a plain tag name, the element parameter of the callbacks
 * defaults to the matching DOM element type.
 *
 * @returns a shallow copy of the rule, with `selector` as its first key
 */
const createSelector = <
  T extends string,
  E extends HTMLElement = T extends keyof HTMLElementTagNameMap
    ? HTMLElementTagNameMap[T]
    : HTMLElement
>(
  definition: ElementSelector<T, E>
) => {
  const { selector, ...rest } = definition

  return { selector, ...rest }
}
// node names (upper case) of the HTML heading elements
const headings = ['H1', 'H2', 'H3', 'H4', 'H5', 'H6']

/**
 * Tells whether `element` is one of the heading elements listed in `headings`.
 *
 * @returns `true`/`false` for an element; a `null` input is passed back as-is
 */
const isHeading = (element: HTMLElement | null) => {
  if (!element) {
    return element
  }

  return headings.includes(element.nodeName)
}
/**
 * Tells whether `element` contains any text other than whitespace.
 */
const hasContent = (element: HTMLElement): boolean => {
  const text = element.textContent ?? ''

  return text.trim() !== ''
}
// start/end pair wrapping the element in a one-argument LaTeX command
const wrapInCommand = (command: string) => ({
  start: () => `\\${command}{`,
  end: () => '}',
})

// rule body for a heading element (outside tables, non-empty) mapped to a sectioning command
const sectioning = (command: string) => ({
  match: (element: HTMLElement) =>
    !element.closest('table') && hasContent(element),
  start: () => `\n\n\\${command}{`,
  end: () => '}\n\n',
})

// rule body for a table section: a newline before and after
const tableSection = () => ({
  start: () => '\n',
  end: () => '\n',
})

// rule body for a table cell; `separator` is the text that follows the cell
const tableCell = (separator: string) => ({
  start: (element: HTMLElement) => {
    const colspan = element.getAttribute('colspan')
    return colspan ? `\\multicolumn{${Number(colspan)}}{` : ''
  },
  end: (element: HTMLElement) => {
    const colspan = element.getAttribute('colspan')
    return colspan ? `}${separator}` : separator
  },
})

// rule body for a list element mapped to a LaTeX list environment
const listEnvironment = (environment: string) => ({
  start: (element: HTMLUListElement | HTMLOListElement) =>
    `\n\n${listIndent(element)}\\begin{${environment}}`,
  end: (element: HTMLUListElement | HTMLOListElement) =>
    `\n${listIndent(element)}\\end{${environment}}\n`,
})

/**
 * The HTML-to-LaTeX conversion rules. They are applied in array order, each
 * to every element its selector (and optional `match` predicate) accepts.
 */
export const selectors = [
  // bold
  createSelector({
    selector: 'b',
    match: element =>
      element.style.fontWeight !== 'normal' &&
      !isHeading(element.parentElement) &&
      hasContent(element),
    ...wrapInCommand('textbf'),
  }),
  createSelector({
    selector: '*',
    match: element =>
      parseInt(element.style.fontWeight) > 400 && hasContent(element),
    ...wrapInCommand('textbf'),
    inside: true,
  }),

  // italic
  createSelector({
    selector: 'i',
    match: element =>
      element.style.fontStyle !== 'normal' && hasContent(element),
    ...wrapInCommand('textit'),
  }),
  createSelector({
    selector: '*',
    match: element =>
      element.style.fontStyle === 'italic' && hasContent(element),
    ...wrapInCommand('textit'),
  }),

  // superscript
  createSelector({
    selector: 'sup',
    match: element => hasContent(element),
    ...wrapInCommand('textsuperscript'),
  }),
  createSelector({
    selector: 'span',
    match: element =>
      element.style.verticalAlign === 'super' && hasContent(element),
    ...wrapInCommand('textsuperscript'),
  }),

  // subscript
  createSelector({
    selector: 'sub',
    match: element => hasContent(element),
    ...wrapInCommand('textsubscript'),
  }),
  createSelector({
    selector: 'span',
    match: element =>
      element.style.verticalAlign === 'sub' && hasContent(element),
    ...wrapInCommand('textsubscript'),
  }),

  // links
  createSelector({
    selector: 'a',
    match: element => !!element.href && hasContent(element),
    start: (element: HTMLAnchorElement) => `\\href{${element.href}}{`,
    end: () => '}',
  }),

  // headings
  createSelector({ selector: 'h1', ...sectioning('section') }),
  createSelector({ selector: 'h2', ...sectioning('subsection') }),
  createSelector({ selector: 'h3', ...sectioning('subsubsection') }),
  createSelector({ selector: 'h4', ...sectioning('paragraph') }),
  createSelector({ selector: 'h5', ...sectioning('subparagraph') }),
  // TODO: h6?

  // line breaks
  createSelector({
    selector: 'br',
    match: element => element.parentElement?.nodeName !== 'TD', // TODO: why?
    start: () => '\n\n',
  }),

  // inline code
  createSelector({
    selector: 'code',
    match: element =>
      element.parentElement?.nodeName !== 'PRE' && hasContent(element),
    start: () => '\\verb|',
    end: () => '|',
  }),

  // code blocks
  createSelector({
    selector: 'pre > code',
    match: element => hasContent(element),
    start: () => '\n\n\\begin{verbatim}\n',
    end: () => '\n\\end{verbatim}\n\n',
  }),

  // tables
  createSelector({
    selector: 'table',
    start: element => {
      const columns = tabular(element)
      return `\n\n\\begin{table}\n\\centering\n\\begin{tabular}{${columns}}\n`
    },
    end: () => '\n\\end{tabular}\n\\end{table}\n\n',
  }),
  createSelector({ selector: 'thead', ...tableSection() }),
  createSelector({ selector: 'tfoot', ...tableSection() }),
  createSelector({ selector: 'tbody', ...tableSection() }),
  createSelector({
    selector: 'tr',
    // only rows that are directly followed by another row get a newline
    match: element => element.nextElementSibling?.nodeName === 'TR',
    end: () => '\n',
  }),
  createSelector({
    selector: 'tr > td:not(:last-child), tr > th:not(:last-child)',
    ...tableCell(' & '),
  }),
  createSelector({
    selector: 'tr > td:last-child, tr > th:last-child',
    ...tableCell(' \\\\'),
  }),
  createSelector({
    selector: 'table > caption',
    start: () => '\n\n\\caption{\\label{tab:example}',
    end: () => '}\n\n',
  }),

  // lists
  createSelector({ selector: 'ul', ...listEnvironment('itemize') }),
  createSelector({ selector: 'ol', ...listEnvironment('enumerate') }),
  createSelector({
    selector: 'li',
    start: element => `\n${listIndent(element)}\t\\item `,
  }),

  // paragraphs: a blank line between a paragraph and the one that follows it
  createSelector({
    selector: 'p',
    match: element =>
      element.nextElementSibling?.nodeName === 'P' && hasContent(element),
    end: () => '\n\n',
  }),
]

View file

@ -24,6 +24,7 @@ import { figureModalPasteHandler } from '../figure-modal'
import { isSplitTestEnabled } from '../../../../utils/splitTestUtils'
import { toolbarPanel } from '../toolbar/toolbar-panel'
import { selectDecoratedArgument } from './select-decorated-argument'
import { pasteHtml } from './paste-html'
type Options = {
visual: boolean
@ -204,4 +205,5 @@ const extension = (options: Options) => [
selectDecoratedArgument,
showContentWhenParsed,
figureModalPasteHandler(),
isSplitTestEnabled('paste-html') ? pasteHtml : [],
]

View file

@ -0,0 +1,235 @@
import { FC } from 'react'
import { EditorProviders } from '../../../helpers/editor-providers'
import CodemirrorEditor from '../../../../../frontend/js/features/source-editor/components/codemirror-editor'
import { mockScope } from '../helpers/mock-scope'
// Fixed-size wrapper the editor under test is mounted into
const Container: FC = ({ children }) => {
  const size = { width: 785, height: 785 }

  return <div style={size}>{children}</div>
}
/**
 * Mounts the CodeMirror editor with the visual editor switched on and waits
 * until its content is shown. The content element is aliased as `@content`.
 *
 * @param content - initial document content passed to `mockScope`
 */
const mountEditor = (content = '') => {
  const scope = mockScope(content)
  scope.editor.showVisual = true

  const editor = (
    <Container>
      <EditorProviders scope={scope}>
        <CodemirrorEditor />
      </EditorProviders>
    </Container>
  )

  cy.mount(editor)

  // wait for the content to be parsed and revealed
  cy.get('.cm-content').as('content')
  cy.get('@content').should('have.css', 'opacity', '1')
}
describe('<CodeMirrorEditor/> paste HTML in Visual mode', function () {
  /**
   * Triggers a `paste` event on the editor content carrying `html` as the
   * `text/html` clipboard data. The clipboard's `getData` is spied on and
   * aliased as `@get-data`. The editor must already be mounted.
   */
  const pasteHtmlData = (html: string) => {
    const clipboardData = new DataTransfer()
    clipboardData.setData('text/html', html)
    cy.spy(clipboardData, 'getData').as('get-data')
    cy.get('@content').trigger('paste', { clipboardData })
  }

  beforeEach(function () {
    window.metaAttributesCache.set('ol-preventCompileOnLoad', true)
    // the paste handler is only installed when the split test is enabled
    window.metaAttributesCache.set('ol-splitTestVariants', {
      'paste-html': 'enabled',
    })
    cy.interceptEvents()
    cy.interceptSpelling()
  })

  it('handles paste', function () {
    mountEditor()

    pasteHtmlData('foo')

    cy.get('@content').should('have.text', 'foo')
    cy.get('@get-data').should('have.been.calledOnceWithExactly', 'text/html')
  })

  it('handles a pasted bullet list', function () {
    mountEditor()

    pasteHtmlData('<ul><li>foo</li><li>bar</li></ul>')

    cy.get('@content').should('have.text', ' foo bar')
    cy.get('.ol-cm-item').should('have.length', 2)
  })

  it('handles a pasted numbered list', function () {
    mountEditor()

    pasteHtmlData('<ol><li>foo</li><li>bar</li></ol>')

    cy.get('@content').should('have.text', ' foo bar')
    cy.get('.ol-cm-item').should('have.length', 2)
  })

  it('handles a pasted simple table', function () {
    mountEditor()

    pasteHtmlData(
      '<table><tbody><tr><td>foo</td><td>bar</td></tr></tbody></table>'
    )

    cy.get('@content').should(
      'have.text',
      '\\begin{tabular}{c c}foo & bar ↩\\end{tabular}'
    )
  })

  it('handles a pasted simple table with borders', function () {
    mountEditor()

    pasteHtmlData(
      '<table><tbody><tr><td style="border-left:1px solid black;border-right:1px solid black">foo</td><td style="border-left:1px solid black;border-right:1px solid black">bar</td></tr></tbody></table>'
    )

    cy.get('@content').should(
      'have.text',
      '\\begin{tabular}{| c | c |}foo & bar ↩\\end{tabular}'
    )
  })

  it('handles a pasted table with merged cells', function () {
    mountEditor()

    const data = [
      `<table><tbody>`,
      `<tr><td>test</td><td>test</td><td>test</td></tr>`,
      `<tr><td colspan="2">test</td><td>test</td></tr>`,
      `<tr><td>test</td><td colspan="2">test</td></tr>`,
      `</tbody></table>`,
    ].join('')

    pasteHtmlData(data)

    cy.get('@content').should(
      'have.text',
      '\\begin{tabular}{c c c}test & test & test ↩\\multicolumn{2}{test} & test ↩test & \\multicolumn{2}{test} ↩\\end{tabular}'
    )
  })

  it('handles a pasted link', function () {
    mountEditor()

    pasteHtmlData('<a href="https://example.com/">foo</a>')

    cy.get('@content').should('have.text', '\\href{https://example.com/}{foo}')
    cy.get('.ol-cm-command-href').should('have.length', 1)
  })

  it('handles a pasted code block', function () {
    mountEditor()

    // the fixture previously closed <code> with a stray </a>
    pasteHtmlData('<pre><code>foo</code></pre>')

    cy.get('@content').should('have.text', 'foo')
    cy.get('.ol-cm-environment-verbatim').should('have.length', 5)

    // the raw environment is shown once the line inside it is clicked
    cy.get('.cm-line').eq(2).click()
    cy.get('@content').should(
      'have.text',
      '\\begin{verbatim}foo\\end{verbatim}'
    )
  })

  it('handles pasted inline code', function () {
    mountEditor()

    // the fixture previously closed <code> with a stray </a>
    pasteHtmlData('<code>foo</code>')

    cy.get('@content').should('have.text', '\\verb|foo|')
    cy.get('.ol-cm-command-verb').should('have.length', 1)
  })

  it('handles pasted text with formatting', function () {
    mountEditor()

    pasteHtmlData('<b>foo</b><sup>th</sup> <i>bar</i><sub>2</sub>')

    cy.get('@content').should(
      'have.text',
      'foo\\textsuperscript{th} bar\\textsubscript{2}'
    )
    cy.get('.ol-cm-command-textbf').should('have.length', 1)
    cy.get('.ol-cm-command-textit').should('have.length', 1)
  })

  it('protects special characters', function () {
    mountEditor()

    pasteHtmlData('foo & bar~baz')

    cy.get('@content').should('have.text', 'foo & bar~baz')
    cy.get('.ol-cm-character').should('have.length', 2)
  })

  it('does not protect special characters in code blocks', function () {
    mountEditor()

    pasteHtmlData('foo & bar~baz <code>\\textbf{foo}</code>')

    cy.get('@content').should(
      'have.text',
      'foo & bar~baz \\verb|\\textbf{foo}|'
    )

    cy.get('.cm-line').eq(0).type('{Enter}')
    cy.get('@content').should('have.text', 'foo & bar~baz \\textbf{foo}')
    cy.get('.ol-cm-character').should('have.length', 2)
    cy.get('.ol-cm-command-verb').should('have.length', 1)
  })
})