mirror of https://github.com/overleaf/overleaf.git
synced 2024-11-21 20:47:08 -05:00
Merge pull request #3998 from overleaf/jpa-learn-sanitize

[learn] sanitize the wiki content

GitOrigin-RevId: c114bbd94479e926c0621953fe9c03f6d380a19d

This commit is contained in:
parent 1f3b01486e
commit 903c1110e0

5 changed files with 342 additions and 5 deletions

@@ -14,11 +14,6 @@
   }
 }
-
-img {
-  height: auto;
-  max-width: 100%;
-}
 
 .page-header {
   a {
     font-size: 0.8em;
@@ -235,4 +230,45 @@
   box-shadow: 0 2px 4px rgba(0, 0, 0, 0.35);
   }
 }
+
+/* Keep the below rules in sync with the wiki pages */
+img {
+  height: auto;
+  max-width: 100%;
+}
+
+img.add-vertical-space {
+  padding-bottom: 20px;
+  padding-top: 20px;
+}
+
+th.no-wrap {
+  white-space: nowrap;
+  text-align: left;
+}
+
+/* LATEX and TEX artwork */
+span.TEX {
+  letter-spacing: -0.125em;
+  padding-right: 0.5ex;
+}
+span.TEX span.E {
+  position: relative;
+  top: 0.5ex;
+  padding-right: 0.1ex;
+}
+a span.TEX span.E {
+  text-decoration: none;
+}
+span.LATEX span.A {
+  position: relative;
+  top: -0.5ex;
+  left: -0.4em;
+  font-size: 75%;
+}
+span.LATEX span.TEX {
+  position: relative;
+  left: -0.4em;
+  margin-right: -0.5ex;
+}
 }

services/web/scripts/learn/checkSanitize/README.md (new file, 32 lines)
# Usage

```
node scripts/learn/checkSanitize https://LEARN_WIKI
```

## Bulk export

There is a bulk export for MediaWiki pages, but it produces different
HTML escaping compared to the regular parse API we use in web.

The bulk export does not escape all the placeholder HTML-like elements,
like `<project-id` or `<document goes here>`.

## Example output

Here is how a missing tag gets flagged:
```
---
page           : MediaWiki markup for the Overleaf support team
title          : MediaWiki markup for the Overleaf support team
match          : false
toText         : false
text           : "Overleaf</strong></td>\n </tr>\n <tr><td>Kb/<strong>TITLE_SLUG</strong></td><td><nowiki>https://www.overleaf.com/learn/how-to/</nowiki><strong>TITLE_SLUG</strong></td>\n </"
sanitized      : "Overleaf</strong></td>\n </tr>\n <tr><td>Kb/<strong>TITLE_SLUG</strong></td><td>&lt;nowiki&gt;https://www.overleaf.com/learn/how-to/&lt;/nowiki&gt;<strong>TITLE_SLUG</strong></td>\n "
textToText     : " \n \n \n \n MediaWiki page\n Maps to on Overleaf\n \n Kb/TITLE_SLUGhttps://www.overleaf.com/learn/how-to/TITLE_SLUG\n "
sanitizedToText: " \n \n \n \n MediaWiki page\n Maps to on Overleaf\n \n Kb/TITLE_SLUG<nowiki>https://www.overleaf.com/learn/how-to/</nowiki>TITLE"
```

Note the hidden/escaped `<nowiki>` element.
In addition to the side-by-side comparison of HTML you will see a plain-text diff.
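
The regular parse API mentioned in the README is what `scrape.js` (further down in this diff) calls for every page. A minimal sketch of that per-page request, assuming `LEARN_WIKI` stands in for the wiki host and with `fetchParsedPage` as a purely illustrative helper name:

```js
const fetch = require('node-fetch')

// Fetch one wiki page through the MediaWiki parse API; the payload carries
// the fields the checker reads later (parsed.title, parsed.text['*']).
async function fetchParsedPage(baseUrl, page) {
  const uri = new URL(baseUrl + '/learn-scripts/api.php')
  uri.search = new URLSearchParams({
    page,
    action: 'parse',
    format: 'json',
    redirects: true,
  }).toString()
  const response = await fetch(uri)
  return (await response.json()).parse
}

// e.g. fetchParsedPage('https://LEARN_WIKI', 'Some page title')
```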

services/web/scripts/learn/checkSanitize/checkSanitizeOptions.js (new file, 120 lines)
const crypto = require('crypto')
const fs = require('fs')
const Path = require('path')

const cheerio = require('cheerio')
const prettier = require('prettier')
const sanitizeHtml = require('sanitize-html')

const {
  sanitizeOptions,
} = require('../../../modules/learn/app/src/sanitizeOptions')

const EXTRACT_STYLE = process.env.EXTRACT_STYLES === 'true'
const OMIT_STYLE = process.env.OMIT_STYLE !== 'false'
const DUMP_CSS_IN = Path.join(
  Path.dirname(Path.dirname(Path.dirname(__dirname))),
  'data',
  'dumpFolder'
)

function hash(blob) {
  return crypto.createHash('sha1').update(blob).digest('hex')
}

function normalize(blob, title) {
  // styles are dropped in web and kept in wiki pages for previewing there.
  blob = blob.replace(/<style>(.+?)<\/style>/gs, (_, match) => {
    if (EXTRACT_STYLE) {
      // normalize css with prettier
      const css = prettier.format(match, { parser: 'css' })
      fs.writeFileSync(
        Path.join(DUMP_CSS_IN, `${hash(css)}-${encodeURIComponent(title)}.css`),
        `/* title: ${title} */\n\n${css}`
      )
    }
    if (OMIT_STYLE) {
      return ''
    }
    return match
  })

  // strip comments:
  // - comment at the bottom of each page
  blob = blob.replace(/<!-- \nNewPP limit report.+/s, '')
  // - annotation of math characters
  blob = blob.replace(/<!-- . -->/g, '')

  // wrap for consistent rendering
  if (blob.indexOf('<html><head>') !== 0) {
    blob = `<html><head>${blob}</head></html>`
  }

  // normalize inline style:
  // - drop trailing ;
  blob = blob.replace(/style="([^"]+);"/g, (_, style) => `style="${style}"`)
  // - normalize whitespace
  blob = blob.replace(
    /style="([^"]+)"/g,
    (_, style) => `style="${style.trim().replace(/([:;])\s+/g, '$1')}"`
  )

  // let cheerio do another pass
  return cheerio.load(blob).html()
}

function toText(blob) {
  return cheerio.load(blob).text()
}

const zoomOut = 50
function peak(content, offset) {
  // show some more content before/after the mismatch
  if (offset > zoomOut) {
    offset -= zoomOut
  }
  // wrap in JSON to escape new line characters
  return JSON.stringify(content.slice(offset, offset + chunkSize + 2 * zoomOut))
}

const chunkSize = 100
function findFirstMismatch(a, b) {
  if (a === b) return a.length
  let i = 0
  while (
    a.length > chunkSize &&
    b.length > chunkSize &&
    a.slice(0, chunkSize) === b.slice(0, chunkSize)
  ) {
    i++
    a = a.slice(chunkSize)
    b = b.slice(chunkSize)
  }
  return i * chunkSize
}

function checkSanitizeOptions(page, title, text) {
  text = normalize(text, title)
  const sanitized = normalize(sanitizeHtml(text, sanitizeOptions))
  if (text === sanitized) return

  const offset = findFirstMismatch(text, sanitized)

  const textToText = toText(text)
  const sanitizedToText = toText(sanitized)
  const offsetText = findFirstMismatch(textToText, sanitizedToText)

  console.error('---')
  console.error('page           :', page)
  console.error('title          :', title)
  console.error('match          :', text === sanitized)
  console.error('toText         :', toText(text) === toText(sanitized))
  console.error('text           :', peak(text, offset))
  console.error('sanitized      :', peak(sanitized, offset))
  console.error('textToText     :', peak(textToText, offsetText))
  console.error('sanitizedToText:', peak(sanitizedToText, offsetText))
}

module.exports = {
  checkSanitizeOptions,
}
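
A quick illustration of the two inline-style rewrites in `normalize()` above, using a made-up input string: the trailing `;` is dropped first, then whitespace after `:` and `;` is collapsed.

```js
let s = '<td style="color: red; background: #fff;">x</td>'
// 1. drop the trailing ";"
s = s.replace(/style="([^"]+);"/g, (_, style) => `style="${style}"`)
// 2. trim and collapse whitespace after ":" and ";"
s = s.replace(
  /style="([^"]+)"/g,
  (_, style) => `style="${style.trim().replace(/([:;])\s+/g, '$1')}"`
)
console.log(s) // <td style="color:red;background:#fff">x</td>
```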

services/web/scripts/learn/checkSanitize/index.js (new file, 35 lines)
const { checkSanitizeOptions } = require('./checkSanitizeOptions')
const { getAllPagesAndCache, scrapeAndCachePage } = require('./scrape')

async function main() {
  const BASE_URL = process.argv.pop()
  if (!BASE_URL.startsWith('http')) {
    throw new Error(
      'Usage: node scripts/learn/checkSanitize https://LEARN_WIKI'
    )
  }

  const pages = await getAllPagesAndCache(BASE_URL)

  for (const page of pages) {
    try {
      const parsed = await scrapeAndCachePage(BASE_URL, page)

      const title = parsed.title
      const text = parsed.text ? parsed.text['*'] : ''

      checkSanitizeOptions(page, title, text)
    } catch (e) {
      console.error('---')
      console.error(page, e)
      throw e
    }
  }
}

if (require.main === module) {
  main().catch(err => {
    console.error(err)
    process.exit(1)
  })
}

services/web/scripts/learn/checkSanitize/scrape.js (new file, 114 lines)
const Path = require('path')
const fs = require('fs')

const fetch = require('node-fetch')

const CACHE_IN = Path.join(
  Path.dirname(Path.dirname(Path.dirname(__dirname))),
  'data',
  'learnPages'
)

async function scrape(baseUrl, page) {
  const uri = new URL(baseUrl + '/learn-scripts/api.php')
  uri.search = new URLSearchParams({
    page,
    action: 'parse',
    format: 'json',
    redirects: true,
  }).toString()
  const response = await fetch(uri)
  if (response.status !== 200) {
    console.error(response.status, page, response)
  }
  return await response.text()
}

const crypto = require('crypto')

function hash(blob) {
  return crypto.createHash('sha1').update(blob).digest('hex')
}

function getName(page) {
  let enc = encodeURIComponent(page)
  // There are VERY long titles in MediaWiki.
  // Add percent encoding and they exceed the filename size on my Ubuntu box.
  if (enc.length > 100) {
    enc = enc.slice(0, 100) + hash(page)
  }
  return enc
}

async function scrapeAndCachePage(baseUrl, page) {
  const path = Path.join(CACHE_IN, getName(page) + '.json')
  try {
    return JSON.parse(await fs.promises.readFile(path, 'utf-8'))
  } catch (e) {
    const blob = await scrape(baseUrl, page)
    const parsed = JSON.parse(blob).parse
    if (!parsed) {
      console.error(page, blob)
      throw new Error('bad contents')
    }
    await fs.promises.mkdir(CACHE_IN, { recursive: true })
    await fs.promises.writeFile(path, JSON.stringify(parsed, null, 2), 'utf-8')
    return parsed
  }
}

async function getAllPagesFrom(baseUrl, continueFrom) {
  // https://learn.overleaf.com/learn/Special:ApiSandbox#action=query&format=json&generator=allpages&gapfilterredir=nonredirects
  const uri = new URL(baseUrl + '/learn-scripts/api.php')
  uri.search = new URLSearchParams({
    action: 'query',
    format: 'json',
    generator: 'allpages',
    // Ignore pages with redirects. We do not want to check page content twice.
    gapfilterredir: 'nonredirects',
    // Bump the default page size of 10.
    gaplimit: 100,
    ...continueFrom,
  }).toString()
  const response = await fetch(uri)
  if (response.status !== 200) {
    console.error(response.status, continueFrom, response)
  }
  const blob = await response.json()
  const nextContinueFrom = blob && blob.continue
  const pagesRaw = (blob && blob.query && blob.query.pages) || {}
  const pages = Object.values(pagesRaw).map(page => page.title)
  return { nextContinueFrom, pages }
}

async function getAllPages(baseUrl) {
  let continueFrom = {}
  let allPages = []
  while (true) {
    const { nextContinueFrom, pages } = await getAllPagesFrom(
      baseUrl,
      continueFrom
    )
    allPages = allPages.concat(pages)
    if (!nextContinueFrom) break
    continueFrom = nextContinueFrom
  }
  return allPages.sort()
}

async function getAllPagesAndCache(baseUrl) {
  const path = Path.join(CACHE_IN, 'allPages.txt')
  try {
    return JSON.parse(await fs.promises.readFile(path, 'utf-8'))
  } catch (e) {
    const allPages = await getAllPages(baseUrl)
    await fs.promises.mkdir(CACHE_IN, { recursive: true })
    await fs.promises.writeFile(path, JSON.stringify(allPages), 'utf-8')
    return allPages
  }
}

module.exports = {
  getAllPagesAndCache,
  scrapeAndCachePage,
}
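
For context on the pagination loop in `getAllPages()`: the code only relies on `blob.continue` and on the `title` of each entry in `blob.query.pages`. An assumed sketch of one `generator=allpages` response shape, with purely illustrative values:

```js
// Assumed/illustrative response consumed by getAllPagesFrom():
const blob = {
  continue: { gapcontinue: 'Some_Next_Page', continue: 'gapcontinue||' },
  query: {
    pages: {
      '101': { pageid: 101, ns: 0, title: 'Example page A' },
      '102': { pageid: 102, ns: 0, title: 'Example page B' },
    },
  },
}

const pages = Object.values(blob.query.pages).map(page => page.title)
// -> ['Example page A', 'Example page B']
// blob.continue, when present, is spread into the next request's parameters
// as continueFrom; when it is absent the loop stops.
```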