Merge pull request #3998 from overleaf/jpa-learn-sanitize

[learn] sanitize the wiki content

GitOrigin-RevId: c114bbd94479e926c0621953fe9c03f6d380a19d

This commit is contained in:
parent 1f3b01486e
commit 903c1110e0

5 changed files with 342 additions and 5 deletions
@@ -14,11 +14,6 @@
     }
   }
 
-  img {
-    height: auto;
-    max-width: 100%;
-  }
-
   .page-header {
     a {
       font-size: 0.8em;
@@ -235,4 +230,45 @@
       box-shadow: 0 2px 4px rgba(0, 0, 0, 0.35);
     }
   }
+
+  /* Keep the below rules in sync with the wiki pages */
+  img {
+    height: auto;
+    max-width: 100%;
+  }
+
+  img.add-vertical-space {
+    padding-bottom: 20px;
+    padding-top: 20px;
+  }
+
+  th.no-wrap {
+    white-space: nowrap;
+    text-align: left;
+  }
+
+  /* LATEX and TEX artwork */
+  span.TEX {
+    letter-spacing: -0.125em;
+    padding-right: 0.5ex;
+  }
+  span.TEX span.E {
+    position: relative;
+    top: 0.5ex;
+    padding-right: 0.1ex;
+  }
+  a span.TEX span.E {
+    text-decoration: none;
+  }
+  span.LATEX span.A {
+    position: relative;
+    top: -0.5ex;
+    left: -0.4em;
+    font-size: 75%;
+  }
+  span.LATEX span.TEX {
+    position: relative;
+    left: -0.4em;
+    margin-right: -0.5ex;
+  }
 }
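A note on the `span.TEX` / `span.LATEX` rules added above: they draw the TeX and LaTeX wordmarks with plain CSS. The markup shape below is inferred from the selectors (nested `.A`, `.TEX` and `.E` spans) and is only an assumption about what the wiki emits, not something captured from it:

```
// Assumed markup targeted by the span.TEX / span.LATEX rules (illustrative only):
// the raised "a" and lowered "e" come entirely from the CSS above.
const texWordmark = '<span class="TEX">T<span class="E">e</span>X</span>'
const latexWordmark =
  '<span class="LATEX">L<span class="A">a</span>' +
  '<span class="TEX">T<span class="E">e</span>X</span></span>'
```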
services/web/scripts/learn/checkSanitize/README.md (new file, +32 lines)

@@ -0,0 +1,32 @@
# Usage

```
node scripts/learn/checkSanitize https://LEARN_WIKI
```

## Bulk export

There is a bulk export for MediaWiki pages, but it produces different
HTML escaping compared to the regular parse API we use in web.

The bulk export does not escape all the placeholder HTML-like elements,
like `<project-id` or `<document goes here>`.

## Example output

Here is how a missing tag gets flagged:

```
---
page           : MediaWiki markup for the Overleaf support team
title          : MediaWiki markup for the Overleaf support team
match          : false
toText         : false
text           : "Overleaf</strong></td>\n </tr>\n <tr><td>Kb/<strong>TITLE_SLUG</strong></td><td><nowiki>https://www.overleaf.com/learn/how-to/</nowiki><strong>TITLE_SLUG</strong></td>\n </"
sanitized      : "Overleaf</strong></td>\n </tr>\n <tr><td>Kb/<strong>TITLE_SLUG</strong></td><td><nowiki>https://www.overleaf.com/learn/how-to/</nowiki><strong>TITLE_SLUG</strong></td>\n "
textToText     : " \n \n \n \n MediaWiki page\n Maps to on Overleaf\n \n Kb/TITLE_SLUGhttps://www.overleaf.com/learn/how-to/TITLE_SLUG\n "
sanitizedToText: " \n \n \n \n MediaWiki page\n Maps to on Overleaf\n \n Kb/TITLE_SLUG<nowiki>https://www.overleaf.com/learn/how-to/</nowiki>TITLE"
```

Note the hidden/escaped `<nowiki>` element.
In addition to the side-by-side comparison of HTML, you will see a plain-text diff.
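The placeholder elements mentioned above (`<project-id`, `<document goes here>`) are also the kind of content a sanitizer can silently drop, which is exactly what the checker looks for. A minimal sketch with `sanitize-html` and its default options (illustrative; not part of the committed files, and the real behaviour depends on the configured `sanitizeOptions`):

```
const sanitizeHtml = require('sanitize-html')

// An unescaped placeholder is parsed as an unknown tag and discarded, so the
// sanitized HTML no longer matches the wiki output that contained it.
console.log(sanitizeHtml('Your paper: <document goes here>'))
// Prints roughly: "Your paper: "
```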
services/web/scripts/learn/checkSanitize/checkSanitizeOptions.js (new file, +120 lines)

@@ -0,0 +1,120 @@
const crypto = require('crypto')
const fs = require('fs')
const Path = require('path')

const cheerio = require('cheerio')
const prettier = require('prettier')
const sanitizeHtml = require('sanitize-html')

const {
  sanitizeOptions,
} = require('../../../modules/learn/app/src/sanitizeOptions')

const EXTRACT_STYLE = process.env.EXTRACT_STYLES === 'true'
const OMIT_STYLE = process.env.OMIT_STYLE !== 'false'
const DUMP_CSS_IN = Path.join(
  Path.dirname(Path.dirname(Path.dirname(__dirname))),
  'data',
  'dumpFolder'
)

function hash(blob) {
  return crypto.createHash('sha1').update(blob).digest('hex')
}

function normalize(blob, title) {
  // styles are dropped in web and kept in wiki pages for previewing there.
  blob = blob.replace(/<style>(.+?)<\/style>/gs, (_, match) => {
    if (EXTRACT_STYLE) {
      // normalize css with prettier
      const css = prettier.format(match, { parser: 'css' })
      fs.writeFileSync(
        Path.join(DUMP_CSS_IN, `${hash(css)}-${encodeURIComponent(title)}.css`),
        `/* title: ${title} */\n\n${css}`
      )
    }
    if (OMIT_STYLE) {
      return ''
    }
    return match
  })

  // strip comments:
  // - comment at the bottom of each page
  blob = blob.replace(/<!-- \nNewPP limit report.+/s, '')
  // - annotation of math characters
  blob = blob.replace(/<!-- . -->/g, '')

  // wrap for consistent rendering
  if (blob.indexOf('<html><head>') !== 0) {
    blob = `<html><head>${blob}</head></html>`
  }

  // normalize inline style:
  // - drop trailing ;
  blob = blob.replace(/style="([^"]+);"/g, (_, style) => `style="${style}"`)
  // - normalize whitespace
  blob = blob.replace(
    /style="([^"]+)"/g,
    (_, style) => `style="${style.trim().replace(/([:;])\s+/g, '$1')}"`
  )

  // let cheerio do another pass
  return cheerio.load(blob).html()
}

function toText(blob) {
  return cheerio.load(blob).text()
}

const zoomOut = 50
function peak(content, offset) {
  // show some more content before/after the mismatch
  if (offset > zoomOut) {
    offset -= zoomOut
  }
  // wrap in JSON to escape new line characters
  return JSON.stringify(content.slice(offset, offset + chunkSize + 2 * zoomOut))
}

const chunkSize = 100
function findFirstMismatch(a, b) {
  if (a === b) return a.length
  let i = 0
  while (
    a.length > chunkSize &&
    b.length > chunkSize &&
    a.slice(0, chunkSize) === b.slice(0, chunkSize)
  ) {
    i++
    a = a.slice(chunkSize)
    b = b.slice(chunkSize)
  }
  return i * chunkSize
}

function checkSanitizeOptions(page, title, text) {
  text = normalize(text, title)
  const sanitized = normalize(sanitizeHtml(text, sanitizeOptions))
  if (text === sanitized) return

  const offset = findFirstMismatch(text, sanitized)

  const textToText = toText(text)
  const sanitizedToText = toText(sanitized)
  const offsetText = findFirstMismatch(textToText, sanitizedToText)

  console.error('---')
  console.error('page           :', page)
  console.error('title          :', title)
  console.error('match          :', text === sanitized)
  console.error('toText         :', toText(text) === toText(sanitized))
  console.error('text           :', peak(text, offset))
  console.error('sanitized      :', peak(sanitized, offset))
  console.error('textToText     :', peak(textToText, offsetText))
  console.error('sanitizedToText:', peak(sanitizedToText, offsetText))
}

module.exports = {
  checkSanitizeOptions,
}
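For orientation: `checkSanitizeOptions(page, title, text)` prints nothing when sanitizing is lossless and emits the stderr report shown in the README otherwise. A minimal local smoke test might look like this (a sketch, not part of the commit; page name, title and snippet are made up):

```
const { checkSanitizeOptions } = require('./checkSanitizeOptions')

// Silent when normalize(text) and normalize(sanitizeHtml(text, sanitizeOptions))
// agree; otherwise a mismatch report is written to stderr.
checkSanitizeOptions(
  'Example_page', // hypothetical page name
  'Example page', // hypothetical title
  '<p>Hello <strong>world</strong></p>' // hypothetical wiki HTML
)
```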
services/web/scripts/learn/checkSanitize/index.js (new file, +35 lines)

@@ -0,0 +1,35 @@
const { checkSanitizeOptions } = require('./checkSanitizeOptions')
const { getAllPagesAndCache, scrapeAndCachePage } = require('./scrape')

async function main() {
  const BASE_URL = process.argv.pop()
  if (!BASE_URL.startsWith('http')) {
    throw new Error(
      'Usage: node scripts/learn/checkSanitize https://LEARN_WIKI'
    )
  }

  const pages = await getAllPagesAndCache(BASE_URL)

  for (const page of pages) {
    try {
      const parsed = await scrapeAndCachePage(BASE_URL, page)

      const title = parsed.title
      const text = parsed.text ? parsed.text['*'] : ''

      checkSanitizeOptions(page, title, text)
    } catch (e) {
      console.error('---')
      console.error(page, e)
      throw e
    }
  }
}

if (require.main === module) {
  main().catch(err => {
    console.error(err)
    process.exit(1)
  })
}
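The entry point always walks every page of the wiki. To check a single page by hand, the same helpers can be reused from a small script or a Node REPL; a sketch (not part of the commit, page title chosen arbitrarily):

```
const { checkSanitizeOptions } = require('./checkSanitizeOptions')
const { scrapeAndCachePage } = require('./scrape')

// Check one page against the sanitize options, reusing the on-disk cache.
async function checkOne(baseUrl, page) {
  const parsed = await scrapeAndCachePage(baseUrl, page)
  checkSanitizeOptions(page, parsed.title, parsed.text ? parsed.text['*'] : '')
}

checkOne('https://LEARN_WIKI', 'Some_page_title').catch(console.error)
```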
services/web/scripts/learn/checkSanitize/scrape.js (new file, +114 lines)

@@ -0,0 +1,114 @@
const Path = require('path')
const fs = require('fs')

const fetch = require('node-fetch')

const CACHE_IN = Path.join(
  Path.dirname(Path.dirname(Path.dirname(__dirname))),
  'data',
  'learnPages'
)

async function scrape(baseUrl, page) {
  const uri = new URL(baseUrl + '/learn-scripts/api.php')
  uri.search = new URLSearchParams({
    page,
    action: 'parse',
    format: 'json',
    redirects: true,
  }).toString()
  const response = await fetch(uri)
  if (response.status !== 200) {
    console.error(response.status, page, response)
  }
  return await response.text()
}

const crypto = require('crypto')

function hash(blob) {
  return crypto.createHash('sha1').update(blob).digest('hex')
}

function getName(page) {
  let enc = encodeURIComponent(page)
  // There are VERY long titles in MediaWiki.
  // Add percent encoding and they exceed the filename size on my Ubuntu box.
  if (enc.length > 100) {
    enc = enc.slice(0, 100) + hash(page)
  }
  return enc
}

async function scrapeAndCachePage(baseUrl, page) {
  const path = Path.join(CACHE_IN, getName(page) + '.json')
  try {
    return JSON.parse(await fs.promises.readFile(path, 'utf-8'))
  } catch (e) {
    const blob = await scrape(baseUrl, page)
    const parsed = JSON.parse(blob).parse
    if (!parsed) {
      console.error(page, blob)
      throw new Error('bad contents')
    }
    await fs.promises.mkdir(CACHE_IN, { recursive: true })
    await fs.promises.writeFile(path, JSON.stringify(parsed, null, 2), 'utf-8')
    return parsed
  }
}

async function getAllPagesFrom(baseUrl, continueFrom) {
  // https://learn.overleaf.com/learn/Special:ApiSandbox#action=query&format=json&generator=allpages&gapfilterredir=nonredirects
  const uri = new URL(baseUrl + '/learn-scripts/api.php')
  uri.search = new URLSearchParams({
    action: 'query',
    format: 'json',
    generator: 'allpages',
    // Ignore pages with redirects. We do not want to check page content twice.
    gapfilterredir: 'nonredirects',
    // Bump the default page size of 10.
    gaplimit: 100,
    ...continueFrom,
  }).toString()
  const response = await fetch(uri)
  if (response.status !== 200) {
    console.error(response.status, continueFrom, response)
  }
  const blob = await response.json()
  const nextContinueFrom = blob && blob.continue
  const pagesRaw = (blob && blob.query && blob.query.pages) || {}
  const pages = Object.values(pagesRaw).map(page => page.title)
  return { nextContinueFrom, pages }
}

async function getAllPages(baseUrl) {
  let continueFrom = {}
  let allPages = []
  while (true) {
    const { nextContinueFrom, pages } = await getAllPagesFrom(
      baseUrl,
      continueFrom
    )
    allPages = allPages.concat(pages)
    if (!nextContinueFrom) break
    continueFrom = nextContinueFrom
  }
  return allPages.sort()
}

async function getAllPagesAndCache(baseUrl) {
  const path = Path.join(CACHE_IN, 'allPages.txt')
  try {
    return JSON.parse(await fs.promises.readFile(path, 'utf-8'))
  } catch (e) {
    const allPages = await getAllPages(baseUrl)
    await fs.promises.mkdir(CACHE_IN, { recursive: true })
    await fs.promises.writeFile(path, JSON.stringify(allPages), 'utf-8')
    return allPages
  }
}

module.exports = {
  getAllPagesAndCache,
  scrapeAndCachePage,
}
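For reviewers unfamiliar with MediaWiki query continuation, which `getAllPages` loops over: each response carries a `continue` object that is spread back into the next request until it is absent. The shape below is an approximation of one `allpages` batch, not captured from the Overleaf wiki:

```
// Approximate shape of one batch consumed by getAllPagesFrom():
const exampleBatch = {
  continue: { gapcontinue: 'Next_page_title', continue: 'gapcontinue||' },
  query: {
    pages: {
      '101': { pageid: 101, ns: 0, title: 'Example page one' },
      '102': { pageid: 102, ns: 0, title: 'Example page two' },
    },
  },
}
// getAllPages() concatenates the page titles from each batch, stops when
// `continue` is missing, then sorts the full list.
```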