mirror of
https://github.com/overleaf/overleaf.git
synced 2024-12-27 22:31:42 +00:00
903c1110e0
[learn] sanitize the wiki content GitOrigin-RevId: c114bbd94479e926c0621953fe9c03f6d380a19d
114 lines
3.2 KiB
JavaScript
114 lines
3.2 KiB
JavaScript
const Path = require('path')
|
|
const fs = require('fs')
|
|
|
|
const fetch = require('node-fetch')
|
|
|
|
// Directory used for cached wiki data: `data/learnPages`, resolved three
// directory levels above the directory that contains this file.
const CACHE_IN = Path.join(
  Path.dirname(Path.dirname(Path.dirname(__dirname))),
  'data',
  'learnPages'
)
|
|
|
|
/**
 * Request one page from the wiki's `api.php` endpoint using `action=parse`.
 *
 * A non-200 status is reported through `console.error`, but the response body
 * is still read and returned.
 *
 * @param {string} baseUrl - prefix to which '/learn-scripts/api.php' is appended
 * @param {string} page - value sent as the `page` query parameter
 * @returns {Promise<string>} the response body as text
 */
async function scrape(baseUrl, page) {
  const uri = new URL(baseUrl + '/learn-scripts/api.php')
  uri.search = new URLSearchParams({
    page,
    action: 'parse',
    format: 'json',
    redirects: true,
  }).toString()

  const response = await fetch(uri)
  if (response.status !== 200) {
    console.error(response.status, page, response)
  }
  return response.text()
}
|
|
|
|
const crypto = require('crypto')
|
|
|
|
/**
 * Compute the SHA-1 digest of `blob`.
 *
 * @param {string|Buffer} blob - data to hash
 * @returns {string} the digest, hex encoded
 */
function hash(blob) {
  return crypto.createHash('sha1').update(blob).digest('hex')
}
|
|
|
|
/**
 * Turn a page title into a name usable as a cache file name.
 *
 * The title is percent-encoded. There are VERY long titles in media wiki, and
 * once percent-encoded they can exceed the maximum file name size, so an
 * encoded title longer than 100 characters is cut to its first 100 characters
 * followed by the hash of the full, unencoded title.
 *
 * @param {string} page - page title
 * @returns {string} encoded (and possibly shortened) name
 */
function getName(page) {
  const enc = encodeURIComponent(page)
  if (enc.length <= 100) return enc
  return enc.slice(0, 100) + hash(page)
}
|
|
|
|
/**
 * Return the parsed content of a wiki page, using a JSON file under CACHE_IN
 * as a cache.
 *
 * When the cache file cannot be read or parsed, the page is scraped, the
 * `parse` member of the JSON response is written to the cache file
 * (pretty-printed) and returned.
 *
 * @param {string} baseUrl - passed through to `scrape`
 * @param {string} page - page title
 * @returns {Promise<Object>} the cached or freshly scraped `parse` object
 * @throws {Error} 'bad contents' when the scraped response has no `parse` member
 */
async function scrapeAndCachePage(baseUrl, page) {
  const path = Path.join(CACHE_IN, getName(page) + '.json')

  try {
    return JSON.parse(await fs.promises.readFile(path, 'utf-8'))
  } catch (e) {
    // Any failure to read or parse the cache entry is treated as a cache
    // miss: fall through and scrape the page.
  }

  const blob = await scrape(baseUrl, page)
  const parsed = JSON.parse(blob).parse
  if (!parsed) {
    console.error(page, blob)
    throw new Error('bad contents')
  }

  await fs.promises.mkdir(CACHE_IN, { recursive: true })
  await fs.promises.writeFile(path, JSON.stringify(parsed, null, 2), 'utf-8')
  return parsed
}
|
|
|
|
/**
 * Fetch one batch of page titles from the wiki's `api.php` endpoint using the
 * `allpages` generator.
 *
 * A non-200 status is reported through `console.error`; the body is still
 * parsed as JSON afterwards.
 *
 * @param {string} baseUrl - prefix to which '/learn-scripts/api.php' is appended
 * @param {Object} continueFrom - extra query parameters, merged last so they
 *   override the defaults; pass the `nextContinueFrom` of the previous batch
 * @returns {Promise<{nextContinueFrom: (Object|undefined), pages: string[]}>}
 *   the response's `continue` member (falsy when absent) and the titles found
 *   in `query.pages`
 */
async function getAllPagesFrom(baseUrl, continueFrom) {
  // https://learn.overleaf.com/learn/Special:ApiSandbox#action=query&format=json&generator=allpages&gapfilterredir=nonredirects
  const uri = new URL(baseUrl + '/learn-scripts/api.php')
  uri.search = new URLSearchParams({
    action: 'query',
    format: 'json',
    generator: 'allpages',
    // Ignore pages with redirects. We do not want to check page content twice.
    gapfilterredir: 'nonredirects',
    // Bump the default page size of 10.
    gaplimit: 100,
    ...continueFrom,
  }).toString()

  const response = await fetch(uri)
  if (response.status !== 200) {
    console.error(response.status, continueFrom, response)
  }

  const blob = await response.json()
  const nextContinueFrom = blob && blob.continue
  // A response without `query.pages` yields an empty batch.
  const pagesRaw = (blob && blob.query && blob.query.pages) || {}
  const pages = Object.values(pagesRaw).map(page => page.title)
  return { nextContinueFrom, pages }
}
|
|
|
|
/**
 * Collect the titles of all pages by requesting batches from
 * `getAllPagesFrom` until a batch reports no continuation.
 *
 * @param {string} baseUrl - passed through to `getAllPagesFrom`
 * @returns {Promise<string[]>} all titles, sorted with the default
 *   `Array.prototype.sort` ordering
 */
async function getAllPages(baseUrl) {
  const allPages = []
  let continueFrom = {}
  while (true) {
    const { nextContinueFrom, pages } = await getAllPagesFrom(
      baseUrl,
      continueFrom
    )
    // Append in place instead of copying the accumulated list on every batch.
    allPages.push(...pages)
    if (!nextContinueFrom) break
    continueFrom = nextContinueFrom
  }
  return allPages.sort()
}
|
|
|
|
/**
 * Return the list of all page titles, using `allPages.txt` under CACHE_IN as a
 * cache.
 *
 * When the cache file cannot be read or parsed, the list is fetched with
 * `getAllPages`, written to the cache file as JSON and returned.
 *
 * @param {string} baseUrl - passed through to `getAllPages`
 * @returns {Promise<string[]>} the cached or freshly fetched list of titles
 */
async function getAllPagesAndCache(baseUrl) {
  const path = Path.join(CACHE_IN, 'allPages.txt')

  try {
    return JSON.parse(await fs.promises.readFile(path, 'utf-8'))
  } catch (e) {
    // Any failure to read or parse the cache file is treated as a cache miss:
    // fall through and fetch the list.
  }

  const allPages = await getAllPages(baseUrl)
  await fs.promises.mkdir(CACHE_IN, { recursive: true })
  await fs.promises.writeFile(path, JSON.stringify(allPages), 'utf-8')
  return allPages
}
|
|
|
|
module.exports = {
|
|
getAllPagesAndCache,
|
|
scrapeAndCachePage,
|
|
}
|