// Mirror of https://github.com/overleaf/overleaf.git
// Synced 2025-01-23 05:42:55 +00:00, commit 903c1110e0
// [learn] sanitize the wiki content
// GitOrigin-RevId: c114bbd94479e926c0621953fe9c03f6d380a19d
// 120 lines, 3.3 KiB, JavaScript
const crypto = require('crypto')
const fs = require('fs')
const Path = require('path')

const cheerio = require('cheerio')
const prettier = require('prettier')
const sanitizeHtml = require('sanitize-html')

const {
  sanitizeOptions,
} = require('../../../modules/learn/app/src/sanitizeOptions')

// When enabled, every <style> block seen while normalizing is written out as
//  a CSS file. NOTE: the flag is read from EXTRACT_STYLES (plural), unlike
//  the singular OMIT_STYLE below.
const EXTRACT_STYLE = process.env.EXTRACT_STYLES === 'true'
// <style> blocks are removed before comparing unless OMIT_STYLE is exactly
//  the string 'false'.
const OMIT_STYLE = process.env.OMIT_STYLE !== 'false'
// Target directory for extracted CSS: three directory levels above this
//  file's directory, then data/dumpFolder.
const DUMP_CSS_IN = Path.join(
  Path.dirname(Path.dirname(Path.dirname(__dirname))),
  'data',
  'dumpFolder'
)
/**
 * Compute the hex-encoded SHA-1 digest of a blob.
 *
 * In this file the digest is only used to build file names for extracted
 * CSS.
 *
 * @param {string|Buffer} blob - content to hash
 * @returns {string} lowercase hex digest
 */
function hash(blob) {
  return crypto.createHash('sha1').update(blob).digest('hex')
}
/**
 * Normalize an HTML blob so that two renderings of the same page can be
 * compared as plain strings.
 *
 * Steps, in order:
 *  1. <style> blocks: optionally dump their CSS to disk (EXTRACT_STYLE) and
 *     either remove them (OMIT_STYLE) or keep them untouched.
 *  2. Remove the "NewPP limit report" comment together with everything that
 *     follows it, and single-character comments.
 *  3. Wrap the blob in <html><head>…</head></html> unless it already starts
 *     with that prefix.
 *  4. Normalize inline style attributes (trailing `;`, whitespace after `:`
 *     and `;`).
 *  5. Parse and re-serialize with cheerio.
 *
 * Side effect: with EXTRACT_STYLE enabled, one CSS file per <style> block is
 * written synchronously into DUMP_CSS_IN.
 *
 * @param {string} blob - HTML to normalize
 * @param {string} [title] - page title; only used to name and annotate the
 *   dumped CSS files
 * @returns {string} the normalized HTML as serialized by cheerio
 */
function normalize(blob, title) {
  let html = blob

  // styles are dropped in web and kept in wiki pages for previewing there.
  html = html.replace(/<style>(.+?)<\/style>/gs, (styleElement, rawCss) => {
    if (EXTRACT_STYLE) {
      // normalize css with prettier
      // NOTE(review): assumes prettier.format() returns a string
      //  synchronously -- confirm against the installed prettier version.
      const css = prettier.format(rawCss, { parser: 'css' })
      fs.writeFileSync(
        Path.join(DUMP_CSS_IN, `${hash(css)}-${encodeURIComponent(title)}.css`),
        `/* title: ${title} */\n\n${css}`
      )
    }
    // When styles are kept, hand back the whole element. Returning only the
    //  captured CSS would strip the tags and leave raw CSS as page text.
    return OMIT_STYLE ? '' : styleElement
  })

  // strip comments:
  // - comment at the bottom of each page (drops everything after it as well)
  html = html.replace(/<!-- \nNewPP limit report.+/s, '')
  // - annotation of math characters
  html = html.replace(/<!-- . -->/g, '')

  // wrap for consistent rendering
  if (!html.startsWith('<html><head>')) {
    html = `<html><head>${html}</head></html>`
  }

  // normalize inline style:
  // - drop trailing ;
  html = html.replace(/style="([^"]+);"/g, (_, style) => `style="${style}"`)
  // - normalize whitespace
  html = html.replace(
    /style="([^"]+)"/g,
    (_, style) => `style="${style.trim().replace(/([:;])\s+/g, '$1')}"`
  )

  // let cheerio do another pass
  return cheerio.load(html).html()
}
/**
 * Reduce an HTML blob to its text content by parsing it with cheerio.
 *
 * @param {string} blob - HTML to convert
 * @returns {string} the text extracted by cheerio
 */
function toText(blob) {
  return cheerio.load(blob).text()
}
// Number of extra characters shown on each side of the inspected chunk.
const zoomOut = 50

/**
 * Return a JSON-escaped excerpt of `content` around `offset`.
 *
 * The excerpt starts `zoomOut` characters before `offset` (clamped to the
 * start of the content, so small offsets still get their leading context)
 * and spans `size + 2 * zoomOut` characters.
 *
 * @param {string} content - text to take the excerpt from
 * @param {number} offset - position of the chunk of interest
 * @param {number} [size=chunkSize] - length of the chunk of interest
 * @returns {string} the excerpt as a JSON string literal
 */
function peak(content, offset, size = chunkSize) {
  // show some more content before/after the mismatch
  const start = Math.max(0, offset - zoomOut)
  // wrap in JSON to escape new line characters
  return JSON.stringify(content.slice(start, start + size + 2 * zoomOut))
}
// Granularity, in characters, at which mismatches are located.
const chunkSize = 100

/**
 * Locate, at chunk granularity, where two strings start to differ.
 *
 * Both strings are compared in steps of `chunkSize` characters. Scanning
 * stops at the first chunk that differs, or as soon as either string has
 * `chunkSize` or fewer characters left. The result is therefore a multiple
 * of `chunkSize` and a lower bound for the index of the first difference.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number} `a.length` when the strings are identical, otherwise
 *   the start offset of the chunk at which scanning stopped
 */
function findFirstMismatch(a, b) {
  if (a === b) return a.length

  const sameChunkAt = start =>
    a.slice(start, start + chunkSize) === b.slice(start, start + chunkSize)

  // Walk an offset forward; the inputs themselves are left untouched.
  let offset = 0
  while (
    a.length - offset > chunkSize &&
    b.length - offset > chunkSize &&
    sameChunkAt(offset)
  ) {
    offset += chunkSize
  }
  return offset
}
/**
 * Check that sanitizing a page's HTML does not change it, and print a
 * diagnostic to stderr when it does.
 *
 * The page is normalized, sanitized with `sanitizeOptions`, and normalized
 * again. Nothing is printed when both versions are identical. Otherwise the
 * report shows excerpts around the first differing chunk, for both the HTML
 * and its text-only rendering.
 *
 * @param {*} page - page identifier; only echoed in the report
 * @param {string} title - page title; echoed in the report and forwarded to
 *   normalize()
 * @param {string} text - raw HTML of the page
 * @returns {void}
 */
function checkSanitizeOptions(page, title, text) {
  const original = normalize(text, title)
  // Forward the title here too, so a CSS dump made during this second pass
  //  is not named and annotated with "undefined".
  const sanitized = normalize(sanitizeHtml(original, sanitizeOptions), title)
  if (original === sanitized) return

  const offset = findFirstMismatch(original, sanitized)

  const textToText = toText(original)
  const sanitizedToText = toText(sanitized)
  const offsetText = findFirstMismatch(textToText, sanitizedToText)

  console.error('---')
  console.error('page :', page)
  console.error('title :', title)
  // Always false at this point: identical content returned early above.
  console.error('match :', false)
  console.error('toText :', textToText === sanitizedToText)
  console.error('text :', peak(original, offset))
  console.error('sanitized :', peak(sanitized, offset))
  console.error('textToText :', peak(textToText, offsetText))
  console.error('sanitizedToText:', peak(sanitizedToText, offsetText))
}
module.exports = {
|
|
checkSanitizeOptions,
|
|
}
|