Merge pull request #3998 from overleaf/jpa-learn-sanitize

[learn] sanitize the wiki content

GitOrigin-RevId: c114bbd94479e926c0621953fe9c03f6d380a19d

This commit is contained in:
parent 1f3b01486e
commit 903c1110e0

5 changed files with 342 additions and 5 deletions
@@ -14,11 +14,6 @@
     }
   }
 
-  img {
-    height: auto;
-    max-width: 100%;
-  }
-
   .page-header {
     a {
       font-size: 0.8em;
@@ -235,4 +230,45 @@
       box-shadow: 0 2px 4px rgba(0, 0, 0, 0.35);
     }
   }
+
+  /* Keep the below rules in sync with the wiki pages */
+  img {
+    height: auto;
+    max-width: 100%;
+  }
+
+  img.add-vertical-space {
+    padding-bottom: 20px;
+    padding-top: 20px;
+  }
+
+  th.no-wrap {
+    white-space: nowrap;
+    text-align: left;
+  }
+
+  /* LATEX and TEX artwork */
+  span.TEX {
+    letter-spacing: -0.125em;
+    padding-right: 0.5ex;
+  }
+  span.TEX span.E {
+    position: relative;
+    top: 0.5ex;
+    padding-right: 0.1ex;
+  }
+  a span.TEX span.E {
+    text-decoration: none;
+  }
+  span.LATEX span.A {
+    position: relative;
+    top: -0.5ex;
+    left: -0.4em;
+    font-size: 75%;
+  }
+  span.LATEX span.TEX {
+    position: relative;
+    left: -0.4em;
+    margin-right: -0.5ex;
+  }
 }
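A note on the `span.TEX` / `span.LATEX` rules added above: they draw the TeX and LaTeX wordmarks with plain CSS. The markup shape below is inferred from the selectors (nested `.A`, `.TEX` and `.E` spans) and is only an assumption about what the wiki emits, not something captured from it:

```
// Assumed markup targeted by the span.TEX / span.LATEX rules (illustrative only):
// the raised "a" and lowered "e" come entirely from the CSS above.
const texWordmark = '<span class="TEX">T<span class="E">e</span>X</span>'
const latexWordmark =
  '<span class="LATEX">L<span class="A">a</span>' +
  '<span class="TEX">T<span class="E">e</span>X</span></span>'
```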
services/web/scripts/learn/checkSanitize/README.md (new file, +32 lines)

@@ -0,0 +1,32 @@
# Usage

```
node scripts/learn/checkSanitize https://LEARN_WIKI
```

## Bulk export

There is a bulk export for MediaWiki pages, but it produces different
HTML escaping compared to the regular parse API we use in web.

The bulk export does not escape all the placeholder HTML-like elements,
like `<project-id` or `<document goes here>`.

## Example output

Here is how a missing tag gets flagged:

```
---
page           : MediaWiki markup for the Overleaf support team
title          : MediaWiki markup for the Overleaf support team
match          : false
toText         : false
text           : "Overleaf</strong></td>\n </tr>\n <tr><td>Kb/<strong>TITLE_SLUG</strong></td><td><nowiki>https://www.overleaf.com/learn/how-to/</nowiki><strong>TITLE_SLUG</strong></td>\n </"
sanitized      : "Overleaf</strong></td>\n </tr>\n <tr><td>Kb/<strong>TITLE_SLUG</strong></td><td><nowiki>https://www.overleaf.com/learn/how-to/</nowiki><strong>TITLE_SLUG</strong></td>\n "
textToText     : " \n \n \n \n MediaWiki page\n Maps to on Overleaf\n \n Kb/TITLE_SLUGhttps://www.overleaf.com/learn/how-to/TITLE_SLUG\n "
sanitizedToText: " \n \n \n \n MediaWiki page\n Maps to on Overleaf\n \n Kb/TITLE_SLUG<nowiki>https://www.overleaf.com/learn/how-to/</nowiki>TITLE"
```

Note the hidden/escaped `<nowiki>` element.
In addition to the side-by-side comparison of HTML, you will see a plain-text diff.
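The placeholder elements mentioned above (`<project-id`, `<document goes here>`) are also the kind of content a sanitizer can silently drop, which is exactly what the checker looks for. A minimal sketch with `sanitize-html` and its default options (illustrative; not part of the committed files, and the real behaviour depends on the configured `sanitizeOptions`):

```
const sanitizeHtml = require('sanitize-html')

// An unescaped placeholder is parsed as an unknown tag and discarded, so the
// sanitized HTML no longer matches the wiki output that contained it.
console.log(sanitizeHtml('Your paper: <document goes here>'))
// Prints roughly: "Your paper: "
```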
services/web/scripts/learn/checkSanitize/checkSanitizeOptions.js (new file, +120 lines)

@@ -0,0 +1,120 @@
const crypto = require('crypto')
const fs = require('fs')
const Path = require('path')

const cheerio = require('cheerio')
const prettier = require('prettier')
const sanitizeHtml = require('sanitize-html')

const {
  sanitizeOptions,
} = require('../../../modules/learn/app/src/sanitizeOptions')

const EXTRACT_STYLE = process.env.EXTRACT_STYLES === 'true'
const OMIT_STYLE = process.env.OMIT_STYLE !== 'false'
const DUMP_CSS_IN = Path.join(
  Path.dirname(Path.dirname(Path.dirname(__dirname))),
  'data',
  'dumpFolder'
)

function hash(blob) {
  return crypto.createHash('sha1').update(blob).digest('hex')
}

function normalize(blob, title) {
  // styles are dropped in web and kept in wiki pages for previewing there.
  blob = blob.replace(/<style>(.+?)<\/style>/gs, (_, match) => {
    if (EXTRACT_STYLE) {
      // normalize css with prettier
      const css = prettier.format(match, { parser: 'css' })
      fs.writeFileSync(
        Path.join(DUMP_CSS_IN, `${hash(css)}-${encodeURIComponent(title)}.css`),
        `/* title: ${title} */\n\n${css}`
      )
    }
    if (OMIT_STYLE) {
      return ''
    }
    return match
  })

  // strip comments:
  // - comment at the bottom of each page
  blob = blob.replace(/<!-- \nNewPP limit report.+/s, '')
  // - annotation of math characters
  blob = blob.replace(/<!-- . -->/g, '')

  // wrap for consistent rendering
  if (blob.indexOf('<html><head>') !== 0) {
    blob = `<html><head>${blob}</head></html>`
  }

  // normalize inline style:
  // - drop trailing ;
  blob = blob.replace(/style="([^"]+);"/g, (_, style) => `style="${style}"`)
  // - normalize whitespace
  blob = blob.replace(
    /style="([^"]+)"/g,
    (_, style) => `style="${style.trim().replace(/([:;])\s+/g, '$1')}"`
  )

  // let cheerio do another pass
  return cheerio.load(blob).html()
}

function toText(blob) {
  return cheerio.load(blob).text()
}

const zoomOut = 50
function peak(content, offset) {
  // show some more content before/after the mismatch
  if (offset > zoomOut) {
    offset -= zoomOut
  }
  // wrap in JSON to escape new line characters
  return JSON.stringify(content.slice(offset, offset + chunkSize + 2 * zoomOut))
}

const chunkSize = 100
function findFirstMismatch(a, b) {
  if (a === b) return a.length
  let i = 0
  while (
    a.length > chunkSize &&
    b.length > chunkSize &&
    a.slice(0, chunkSize) === b.slice(0, chunkSize)
  ) {
    i++
    a = a.slice(chunkSize)
    b = b.slice(chunkSize)
  }
  return i * chunkSize
}

function checkSanitizeOptions(page, title, text) {
  text = normalize(text, title)
  const sanitized = normalize(sanitizeHtml(text, sanitizeOptions))
  if (text === sanitized) return

  const offset = findFirstMismatch(text, sanitized)

  const textToText = toText(text)
  const sanitizedToText = toText(sanitized)
  const offsetText = findFirstMismatch(textToText, sanitizedToText)

  console.error('---')
  console.error('page           :', page)
  console.error('title          :', title)
  console.error('match          :', text === sanitized)
  console.error('toText         :', toText(text) === toText(sanitized))
  console.error('text           :', peak(text, offset))
  console.error('sanitized      :', peak(sanitized, offset))
  console.error('textToText     :', peak(textToText, offsetText))
  console.error('sanitizedToText:', peak(sanitizedToText, offsetText))
}

module.exports = {
  checkSanitizeOptions,
}
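For orientation: `checkSanitizeOptions(page, title, text)` prints nothing when sanitizing is lossless and emits the stderr report shown in the README otherwise. A minimal local smoke test might look like this (a sketch, not part of the commit; page name, title and snippet are made up):

```
const { checkSanitizeOptions } = require('./checkSanitizeOptions')

// Silent when normalize(text) and normalize(sanitizeHtml(text, sanitizeOptions))
// agree; otherwise a mismatch report is written to stderr.
checkSanitizeOptions(
  'Example_page', // hypothetical page name
  'Example page', // hypothetical title
  '<p>Hello <strong>world</strong></p>' // hypothetical wiki HTML
)
```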
services/web/scripts/learn/checkSanitize/index.js (new file, +35 lines)

@@ -0,0 +1,35 @@
const { checkSanitizeOptions } = require('./checkSanitizeOptions')
const { getAllPagesAndCache, scrapeAndCachePage } = require('./scrape')

async function main() {
  const BASE_URL = process.argv.pop()
  if (!BASE_URL.startsWith('http')) {
    throw new Error(
      'Usage: node scripts/learn/checkSanitize https://LEARN_WIKI'
    )
  }

  const pages = await getAllPagesAndCache(BASE_URL)

  for (const page of pages) {
    try {
      const parsed = await scrapeAndCachePage(BASE_URL, page)

      const title = parsed.title
      const text = parsed.text ? parsed.text['*'] : ''

      checkSanitizeOptions(page, title, text)
    } catch (e) {
      console.error('---')
      console.error(page, e)
      throw e
    }
  }
}

if (require.main === module) {
  main().catch(err => {
    console.error(err)
    process.exit(1)
  })
}
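The entry point always walks every page of the wiki. To check a single page by hand, the same helpers can be reused from a small script or a Node REPL; a sketch (not part of the commit, page title chosen arbitrarily):

```
const { checkSanitizeOptions } = require('./checkSanitizeOptions')
const { scrapeAndCachePage } = require('./scrape')

// Check one page against the sanitize options, reusing the on-disk cache.
async function checkOne(baseUrl, page) {
  const parsed = await scrapeAndCachePage(baseUrl, page)
  checkSanitizeOptions(page, parsed.title, parsed.text ? parsed.text['*'] : '')
}

checkOne('https://LEARN_WIKI', 'Some_page_title').catch(console.error)
```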
services/web/scripts/learn/checkSanitize/scrape.js (new file, +114 lines)

@@ -0,0 +1,114 @@
const Path = require('path')
const fs = require('fs')

const fetch = require('node-fetch')

const CACHE_IN = Path.join(
  Path.dirname(Path.dirname(Path.dirname(__dirname))),
  'data',
  'learnPages'
)

async function scrape(baseUrl, page) {
  const uri = new URL(baseUrl + '/learn-scripts/api.php')
  uri.search = new URLSearchParams({
    page,
    action: 'parse',
    format: 'json',
    redirects: true,
  }).toString()
  const response = await fetch(uri)
  if (response.status !== 200) {
    console.error(response.status, page, response)
  }
  return await response.text()
}

const crypto = require('crypto')

function hash(blob) {
  return crypto.createHash('sha1').update(blob).digest('hex')
}

function getName(page) {
  let enc = encodeURIComponent(page)
  // There are VERY long titles in MediaWiki.
  // Add percent encoding and they exceed the filename size on my Ubuntu box.
  if (enc.length > 100) {
    enc = enc.slice(0, 100) + hash(page)
  }
  return enc
}

async function scrapeAndCachePage(baseUrl, page) {
  const path = Path.join(CACHE_IN, getName(page) + '.json')
  try {
    return JSON.parse(await fs.promises.readFile(path, 'utf-8'))
  } catch (e) {
    const blob = await scrape(baseUrl, page)
    const parsed = JSON.parse(blob).parse
    if (!parsed) {
      console.error(page, blob)
      throw new Error('bad contents')
    }
    await fs.promises.mkdir(CACHE_IN, { recursive: true })
    await fs.promises.writeFile(path, JSON.stringify(parsed, null, 2), 'utf-8')
    return parsed
  }
}

async function getAllPagesFrom(baseUrl, continueFrom) {
  // https://learn.overleaf.com/learn/Special:ApiSandbox#action=query&format=json&generator=allpages&gapfilterredir=nonredirects
  const uri = new URL(baseUrl + '/learn-scripts/api.php')
  uri.search = new URLSearchParams({
    action: 'query',
    format: 'json',
    generator: 'allpages',
    // Ignore pages with redirects. We do not want to check page content twice.
    gapfilterredir: 'nonredirects',
    // Bump the default page size of 10.
    gaplimit: 100,
    ...continueFrom,
  }).toString()
  const response = await fetch(uri)
  if (response.status !== 200) {
    console.error(response.status, continueFrom, response)
  }
  const blob = await response.json()
  const nextContinueFrom = blob && blob.continue
  const pagesRaw = (blob && blob.query && blob.query.pages) || {}
  const pages = Object.values(pagesRaw).map(page => page.title)
  return { nextContinueFrom, pages }
}

async function getAllPages(baseUrl) {
  let continueFrom = {}
  let allPages = []
  while (true) {
    const { nextContinueFrom, pages } = await getAllPagesFrom(
      baseUrl,
      continueFrom
    )
    allPages = allPages.concat(pages)
    if (!nextContinueFrom) break
    continueFrom = nextContinueFrom
  }
  return allPages.sort()
}

async function getAllPagesAndCache(baseUrl) {
  const path = Path.join(CACHE_IN, 'allPages.txt')
  try {
    return JSON.parse(await fs.promises.readFile(path, 'utf-8'))
  } catch (e) {
    const allPages = await getAllPages(baseUrl)
    await fs.promises.mkdir(CACHE_IN, { recursive: true })
    await fs.promises.writeFile(path, JSON.stringify(allPages), 'utf-8')
    return allPages
  }
}

module.exports = {
  getAllPagesAndCache,
  scrapeAndCachePage,
}
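For reviewers unfamiliar with MediaWiki query continuation, which `getAllPages` loops over: each response carries a `continue` object that is spread back into the next request until it is absent. The shape below is an approximation of one `allpages` batch, not captured from the Overleaf wiki:

```
// Approximate shape of one batch consumed by getAllPagesFrom():
const exampleBatch = {
  continue: { gapcontinue: 'Next_page_title', continue: 'gapcontinue||' },
  query: {
    pages: {
      '101': { pageid: 101, ns: 0, title: 'Example page one' },
      '102': { pageid: 102, ns: 0, title: 'Example page two' },
    },
  },
}
// getAllPages() concatenates the page titles from each batch, stops when
// `continue` is missing, then sorts the full list.
```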