Merge pull request #3998 from overleaf/jpa-learn-sanitize

[learn] sanitize the wiki content

GitOrigin-RevId: c114bbd94479e926c0621953fe9c03f6d380a19d
Hugh O'Brien 2021-05-07 14:16:12 +01:00 committed by Copybot
parent 1f3b01486e
commit 903c1110e0
5 changed files with 342 additions and 5 deletions

View file

@@ -14,11 +14,6 @@
    }
  }
  img {
    height: auto;
    max-width: 100%;
  }
  .page-header {
    a {
      font-size: 0.8em;
@@ -235,4 +230,45 @@
      box-shadow: 0 2px 4px rgba(0, 0, 0, 0.35);
    }
  }

  /* Keep the below rules in sync with the wiki pages */
  img {
    height: auto;
    max-width: 100%;
  }

  img.add-vertical-space {
    padding-bottom: 20px;
    padding-top: 20px;
  }

  th.no-wrap {
    white-space: nowrap;
    text-align: left;
  }

  /* LATEX and TEX artwork */
  span.TEX {
    letter-spacing: -0.125em;
    padding-right: 0.5ex;
  }

  span.TEX span.E {
    position: relative;
    top: 0.5ex;
    padding-right: 0.1ex;
  }

  a span.TEX span.E {
    text-decoration: none;
  }

  span.LATEX span.A {
    position: relative;
    top: -0.5ex;
    left: -0.4em;
    font-size: 75%;
  }

  span.LATEX span.TEX {
    position: relative;
    left: -0.4em;
    margin-right: -0.5ex;
  }
}

View file

@@ -0,0 +1,32 @@
# Usage
```
node scripts/learn/checkSanitize https://LEARN_WIKI
```
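The comparison honours two optional environment variables, as implemented in `checkSanitizeOptions.js`:
```
# dump the normalized CSS of every page into data/dumpFolder
EXTRACT_STYLES=true node scripts/learn/checkSanitize https://LEARN_WIKI

# keep <style> blocks in the comparison instead of dropping them
OMIT_STYLE=false node scripts/learn/checkSanitize https://LEARN_WIKI
```
Scraped pages and the page list are cached under `data/learnPages` (see `scrape.js`), resolved relative to the script location, so repeated runs do not hit the wiki again. Delete the cache folders (from the project root) to force a fresh scrape and to clean up any dumped CSS:
```
rm -rf data/learnPages data/dumpFolder
```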
## Bulk export
There is a bulk export for MediaWiki pages, but it produces different HTML
escaping compared to the regular parse API we use in web: the bulk export
does not escape all of the placeholder HTML-like elements, such as
`<project-id` or `<document goes here>`. This script therefore goes through
the regular parse API, one page at a time.
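Concretely, each page is fetched individually through the parse API (see `scrape.js`); the request looks roughly like this, where `TITLE` stands for the page title and the parameter order is illustrative:
```
https://LEARN_WIKI/learn-scripts/api.php?action=parse&format=json&redirects=true&page=TITLE
```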
## Example output
Here is how a tag that is missing from the sanitizer's allow-list gets flagged:
```
---
page : MediaWiki markup for the Overleaf support team
title : MediaWiki markup for the Overleaf support team
match : false
toText : false
text : "Overleaf</strong></td>\n </tr>\n <tr><td>Kb/<strong>TITLE_SLUG</strong></td><td><nowiki>https://www.overleaf.com/learn/how-to/</nowiki><strong>TITLE_SLUG</strong></td>\n </"
sanitized : "Overleaf</strong></td>\n </tr>\n <tr><td>Kb/<strong>TITLE_SLUG</strong></td><td>&lt;nowiki&gt;https://www.overleaf.com/learn/how-to/&lt;/nowiki&gt;<strong>TITLE_SLUG</strong></td>\n "
textToText : " \n \n \n \n MediaWiki page\n Maps to on Overleaf\n \n Kb/TITLE_SLUGhttps://www.overleaf.com/learn/how-to/TITLE_SLUG\n "
sanitizedToText: " \n \n \n \n MediaWiki page\n Maps to on Overleaf\n \n Kb/TITLE_SLUG<nowiki>https://www.overleaf.com/learn/how-to/</nowiki>TITLE"
```
Note the hidden/escaped `<nowiki>` element: in the original HTML it is a non-rendered tag, while the sanitized version escapes it, so it shows up as literal text.
In addition to the side-by-side comparison of the raw HTML (`text` vs `sanitized`), the script prints the same comparison for the extracted plain text (`textToText` vs `sanitizedToText`).
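The escaping itself comes from `sanitize-html`: tags outside the allow-list get entity-escaped rather than dropped. The snippet below is a minimal sketch of that behaviour with made-up input and options; it is not the real `sanitizeOptions` from the learn module:
```
const sanitizeHtml = require('sanitize-html')

// Illustrative input: a placeholder element as it appears in the wiki HTML.
const input = '<td><nowiki>https://www.overleaf.com/learn/how-to/TITLE_SLUG</nowiki></td>'

// Minimal options for this sketch only -- not the real sanitizeOptions.
const output = sanitizeHtml(input, {
  allowedTags: ['td'],
  disallowedTagsMode: 'escape',
})

console.log(output)
// <td>&lt;nowiki&gt;https://www.overleaf.com/learn/how-to/TITLE_SLUG&lt;/nowiki&gt;</td>
```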

View file

@@ -0,0 +1,120 @@
const crypto = require('crypto')
const fs = require('fs')
const Path = require('path')
const cheerio = require('cheerio')
const prettier = require('prettier')
const sanitizeHtml = require('sanitize-html')
const {
  sanitizeOptions,
} = require('../../../modules/learn/app/src/sanitizeOptions')

const EXTRACT_STYLE = process.env.EXTRACT_STYLES === 'true'
const OMIT_STYLE = process.env.OMIT_STYLE !== 'false'

const DUMP_CSS_IN = Path.join(
  Path.dirname(Path.dirname(Path.dirname(__dirname))),
  'data',
  'dumpFolder'
)

function hash(blob) {
  return crypto.createHash('sha1').update(blob).digest('hex')
}
function normalize(blob, title) {
  // styles are dropped in web and kept in wiki pages for previewing there.
  blob = blob.replace(/<style>(.+?)<\/style>/gs, (_, match) => {
    if (EXTRACT_STYLE) {
      // normalize css with prettier
      const css = prettier.format(match, { parser: 'css' })
      fs.writeFileSync(
        Path.join(DUMP_CSS_IN, `${hash(css)}-${encodeURIComponent(title)}.css`),
        `/* title: ${title} */\n\n${css}`
      )
    }
    if (OMIT_STYLE) {
      return ''
    }
    return match
  })

  // strip comments:
  // - comment at the bottom of each page
  blob = blob.replace(/<!-- \nNewPP limit report.+/s, '')
  // - annotation of math characters
  blob = blob.replace(/<!-- . -->/g, '')

  // wrap for consistent rendering
  if (blob.indexOf('<html><head>') !== 0) {
    blob = `<html><head>${blob}</head></html>`
  }

  // normalize inline style:
  // - drop trailing ;
  blob = blob.replace(/style="([^"]+);"/g, (_, style) => `style="${style}"`)
  // - normalize whitespace
  blob = blob.replace(
    /style="([^"]+)"/g,
    (_, style) => `style="${style.trim().replace(/([:;])\s+/g, '$1')}"`
  )

  // let cheerio do another pass
  return cheerio.load(blob).html()
}

function toText(blob) {
  return cheerio.load(blob).text()
}
const zoomOut = 50
function peak(content, offset) {
  // show some more content before/after the mismatch
  if (offset > zoomOut) {
    offset -= zoomOut
  }
  // wrap in JSON to escape new line characters
  return JSON.stringify(content.slice(offset, offset + chunkSize + 2 * zoomOut))
}

const chunkSize = 100
function findFirstMismatch(a, b) {
  // Walk both strings in chunkSize steps and return the offset of the first
  // chunk that differs (or the full length when they are identical).
  if (a === b) return a.length
  let i = 0
  while (
    a.length > chunkSize &&
    b.length > chunkSize &&
    a.slice(0, chunkSize) === b.slice(0, chunkSize)
  ) {
    i++
    a = a.slice(chunkSize)
    b = b.slice(chunkSize)
  }
  return i * chunkSize
}
function checkSanitizeOptions(page, title, text) {
  text = normalize(text, title)
  const sanitized = normalize(sanitizeHtml(text, sanitizeOptions))
  if (text === sanitized) return

  const offset = findFirstMismatch(text, sanitized)

  const textToText = toText(text)
  const sanitizedToText = toText(sanitized)
  const offsetText = findFirstMismatch(textToText, sanitizedToText)

  console.error('---')
  console.error('page :', page)
  console.error('title :', title)
  console.error('match :', text === sanitized)
  console.error('toText :', toText(text) === toText(sanitized))
  console.error('text :', peak(text, offset))
  console.error('sanitized :', peak(sanitized, offset))
  console.error('textToText :', peak(textToText, offsetText))
  console.error('sanitizedToText:', peak(sanitizedToText, offsetText))
}

module.exports = {
  checkSanitizeOptions,
}

View file

@@ -0,0 +1,35 @@
const { checkSanitizeOptions } = require('./checkSanitizeOptions')
const { getAllPagesAndCache, scrapeAndCachePage } = require('./scrape')
async function main() {
  const BASE_URL = process.argv.pop()
  if (!BASE_URL.startsWith('http')) {
    throw new Error(
      'Usage: node scripts/learn/checkSanitize https://LEARN_WIKI'
    )
  }

  const pages = await getAllPagesAndCache(BASE_URL)

  for (const page of pages) {
    try {
      const parsed = await scrapeAndCachePage(BASE_URL, page)

      const title = parsed.title
      const text = parsed.text ? parsed.text['*'] : ''

      checkSanitizeOptions(page, title, text)
    } catch (e) {
      console.error('---')
      console.error(page, e)
      throw e
    }
  }
}

if (require.main === module) {
  main().catch(err => {
    console.error(err)
    process.exit(1)
  })
}

View file

@@ -0,0 +1,114 @@
const Path = require('path')
const fs = require('fs')
const fetch = require('node-fetch')
const CACHE_IN = Path.join(
  Path.dirname(Path.dirname(Path.dirname(__dirname))),
  'data',
  'learnPages'
)

async function scrape(baseUrl, page) {
  const uri = new URL(baseUrl + '/learn-scripts/api.php')
  uri.search = new URLSearchParams({
    page,
    action: 'parse',
    format: 'json',
    redirects: true,
  }).toString()
  const response = await fetch(uri)
  if (response.status !== 200) {
    console.error(response.status, page, response)
  }
  return await response.text()
}

const crypto = require('crypto')

function hash(blob) {
  return crypto.createHash('sha1').update(blob).digest('hex')
}

function getName(page) {
  let enc = encodeURIComponent(page)
  // There are VERY long titles in MediaWiki. Once percent-encoded they
  // exceed the maximum filename length on my Ubuntu box.
  if (enc.length > 100) {
    enc = enc.slice(0, 100) + hash(page)
  }
  return enc
}
async function scrapeAndCachePage(baseUrl, page) {
  const path = Path.join(CACHE_IN, getName(page) + '.json')
  try {
    return JSON.parse(await fs.promises.readFile(path, 'utf-8'))
  } catch (e) {
    const blob = await scrape(baseUrl, page)
    const parsed = JSON.parse(blob).parse
    if (!parsed) {
      console.error(page, blob)
      throw new Error('bad contents')
    }
    await fs.promises.mkdir(CACHE_IN, { recursive: true })
    await fs.promises.writeFile(path, JSON.stringify(parsed, null, 2), 'utf-8')
    return parsed
  }
}

async function getAllPagesFrom(baseUrl, continueFrom) {
  // https://learn.overleaf.com/learn/Special:ApiSandbox#action=query&format=json&generator=allpages&gapfilterredir=nonredirects
  const uri = new URL(baseUrl + '/learn-scripts/api.php')
  uri.search = new URLSearchParams({
    action: 'query',
    format: 'json',
    generator: 'allpages',
    // Ignore pages with redirects. We do not want to check page content twice.
    gapfilterredir: 'nonredirects',
    // Bump the default page size of 10.
    gaplimit: 100,
    ...continueFrom,
  }).toString()
  const response = await fetch(uri)
  if (response.status !== 200) {
    console.error(response.status, continueFrom, response)
  }
  const blob = await response.json()

  const nextContinueFrom = blob && blob.continue
  const pagesRaw = (blob && blob.query && blob.query.pages) || {}
  const pages = Object.values(pagesRaw).map(page => page.title)
  return { nextContinueFrom, pages }
}

async function getAllPages(baseUrl) {
  let continueFrom = {}
  let allPages = []
  while (true) {
    const { nextContinueFrom, pages } = await getAllPagesFrom(
      baseUrl,
      continueFrom
    )
    allPages = allPages.concat(pages)
    if (!nextContinueFrom) break
    continueFrom = nextContinueFrom
  }
  return allPages.sort()
}

async function getAllPagesAndCache(baseUrl) {
  const path = Path.join(CACHE_IN, 'allPages.txt')
  try {
    return JSON.parse(await fs.promises.readFile(path, 'utf-8'))
  } catch (e) {
    const allPages = await getAllPages(baseUrl)
    await fs.promises.mkdir(CACHE_IN, { recursive: true })
    await fs.promises.writeFile(path, JSON.stringify(allPages), 'utf-8')
    return allPages
  }
}

module.exports = {
  getAllPagesAndCache,
  scrapeAndCachePage,
}