mirror of
https://github.com/overleaf/overleaf.git
synced 2024-11-29 16:53:42 -05:00
28e87a9314
Use fetch-utils in web GitOrigin-RevId: cbd0298200bbe42567c6e94934bfb5114fa9b66f
130 lines
3.4 KiB
JavaScript
130 lines
3.4 KiB
JavaScript
const Path = require('path')
|
|
const fs = require('fs')
|
|
|
|
const {
|
|
fetchString,
|
|
fetchJson,
|
|
RequestFailedError,
|
|
} = require('@overleaf/fetch-utils')
|
|
|
|
// Directory used for cached API responses: `data/learnPages`, located three
// directory levels above the directory containing this file.
const CACHE_IN = Path.join(__dirname, '..', '..', '..', 'data', 'learnPages')
|
|
|
|
/**
 * Request a single page from the wiki API using `action=parse`.
 *
 * Failures are reported with `console.error` and are NOT propagated: whenever
 * the request throws, the returned promise resolves to `undefined`.
 *
 * @param {string} baseUrl - Base URL of the wiki; `/learn-scripts/api.php` is
 *   appended to it.
 * @param {string} page - Value sent as the `page` query parameter.
 * @returns {Promise<*>} Whatever `fetchString` resolves to (presumably the
 *   response body as text), or `undefined` when the request failed.
 */
async function scrape(baseUrl, page) {
  const endpoint = new URL(baseUrl + '/learn-scripts/api.php')
  const query = new URLSearchParams({
    page,
    action: 'parse',
    format: 'json',
    redirects: true,
  })
  endpoint.search = query.toString()

  let body
  try {
    body = await fetchString(endpoint)
  } catch (err) {
    // Best effort: log the problem and leave `body` undefined.
    const isRequestFailure = err instanceof RequestFailedError
    if (isRequestFailure) {
      console.error(err.response.status, page, err.response)
    } else {
      console.error(err)
    }
  }
  return body
}
|
|
|
|
const crypto = require('crypto')
|
|
|
|
/**
 * Compute the SHA-1 digest of `blob`.
 *
 * @param {string|Buffer|TypedArray|DataView} blob - Data to digest.
 * @returns {string} The digest encoded as a hexadecimal string.
 */
function hash(blob) {
  const sha1 = crypto.createHash('sha1')
  sha1.update(blob)
  return sha1.digest('hex')
}
|
|
|
|
/**
 * Derive a file-system friendly name from a page title.
 *
 * The title is percent-encoded. Encoded names longer than 100 characters are
 * cut to their first 100 characters and suffixed with the hash of the full,
 * unencoded title.
 *
 * @param {string} page - Page title.
 * @returns {string} Name to use for the page's cache file (without extension).
 */
function getName(page) {
  const encoded = encodeURIComponent(page)
  if (encoded.length <= 100) {
    return encoded
  }
  // Some wiki titles are extremely long; once percent-encoded they can exceed
  // the maximum file name length (observed on an Ubuntu machine).
  return encoded.slice(0, 100) + hash(page)
}
|
|
|
|
/**
 * Return the `parse` payload of a wiki page, reading it from the on-disk
 * cache when available and scraping (then caching) it otherwise.
 *
 * The cache entry lives in `CACHE_IN` as `<getName(page)>.json`. Any error
 * while reading or decoding the cache entry is treated as a cache miss.
 *
 * @param {string} baseUrl - Base URL of the wiki, forwarded to `scrape`.
 * @param {string} page - Page title.
 * @returns {Promise<Object>} The `parse` property of the API response.
 * @throws {Error} When the page could not be fetched, or when the response
 *   has no `parse` property ('bad contents').
 * @throws {SyntaxError} When the fetched body is not valid JSON.
 */
async function scrapeAndCachePage(baseUrl, page) {
  const path = Path.join(CACHE_IN, getName(page) + '.json')
  try {
    return JSON.parse(await fs.promises.readFile(path, 'utf-8'))
  } catch (e) {
    // Cache miss (or unreadable/corrupt entry): fall through and scrape.
  }

  const blob = await scrape(baseUrl, page)
  if (blob === undefined) {
    // `scrape` logs request failures and resolves to undefined. Fail with a
    // clear message instead of letting JSON.parse(undefined) raise a
    // misleading SyntaxError.
    throw new Error(`failed to fetch page: ${page}`)
  }

  const decoded = JSON.parse(blob)
  // Guard against bodies that decode to `null` before reading `.parse`.
  const parsed = decoded && decoded.parse
  if (!parsed) {
    console.error(page, blob)
    throw new Error('bad contents')
  }

  await fs.promises.mkdir(CACHE_IN, { recursive: true })
  await fs.promises.writeFile(path, JSON.stringify(parsed, null, 2), 'utf-8')
  return parsed
}
|
|
|
|
/**
 * Fetch one batch of page titles from the wiki API (`generator=allpages`).
 *
 * API sandbox for this query:
 * https://learn.overleaf.com/learn/Special:ApiSandbox#action=query&format=json&generator=allpages&gapfilterredir=nonredirects
 *
 * A `RequestFailedError` is logged and swallowed, in which case the result is
 * an empty batch with no continuation. Any other error is logged and
 * rethrown.
 *
 * @param {string} baseUrl - Base URL of the wiki; `/learn-scripts/api.php` is
 *   appended to it.
 * @param {Object} continueFrom - Extra query parameters merged into the
 *   request; callers pass the `continue` object of the previous response.
 * @returns {Promise<{nextContinueFrom: *, pages: string[]}>} The response's
 *   `continue` value (falsy when absent) and the titles found in
 *   `query.pages`.
 */
async function getAllPagesFrom(baseUrl, continueFrom) {
  const endpoint = new URL(baseUrl + '/learn-scripts/api.php')
  const query = new URLSearchParams({
    action: 'query',
    format: 'json',
    generator: 'allpages',
    // Skip redirect pages so the same content is not checked twice.
    gapfilterredir: 'nonredirects',
    // Raise the batch size from the default of 10.
    gaplimit: 100,
    ...continueFrom,
  })
  endpoint.search = query.toString()

  let blob
  try {
    blob = await fetchJson(endpoint)
  } catch (err) {
    if (!(err instanceof RequestFailedError)) {
      console.error(err)
      throw err
    }
    // Request failures are only reported; `blob` stays undefined.
    console.error(err.response.status, continueFrom, err.response)
  }

  const pagesRaw = (blob && blob.query && blob.query.pages) || {}
  const titles = []
  for (const entry of Object.values(pagesRaw)) {
    titles.push(entry.title)
  }
  return { nextContinueFrom: blob && blob.continue, pages: titles }
}
|
|
|
|
/**
 * Collect the titles of all pages by requesting batch after batch until the
 * API stops returning a continuation value.
 *
 * @param {string} baseUrl - Base URL of the wiki, forwarded to
 *   `getAllPagesFrom`.
 * @returns {Promise<string[]>} All collected titles, sorted with the default
 *   `Array.prototype.sort` ordering.
 */
async function getAllPages(baseUrl) {
  const titles = []
  let cursor = {}
  do {
    const batch = await getAllPagesFrom(baseUrl, cursor)
    for (const title of batch.pages) {
      titles.push(title)
    }
    // A falsy continuation value means this was the last batch.
    cursor = batch.nextContinueFrom
  } while (cursor)
  return titles.sort()
}
|
|
|
|
/**
 * Return the list of all page titles, served from `allPages.txt` inside
 * `CACHE_IN` when that file can be read and decoded as JSON; otherwise the
 * list is fetched via `getAllPages` and written to that file.
 *
 * @param {string} baseUrl - Base URL of the wiki, forwarded to `getAllPages`.
 * @returns {Promise<string[]>} The cached or freshly fetched page titles.
 */
async function getAllPagesAndCache(baseUrl) {
  const cacheFile = Path.join(CACHE_IN, 'allPages.txt')
  try {
    const cachedText = await fs.promises.readFile(cacheFile, 'utf-8')
    return JSON.parse(cachedText)
  } catch (e) {
    // Any read/decode error counts as a cache miss; rebuild the list below.
  }

  const titles = await getAllPages(baseUrl)
  await fs.promises.mkdir(CACHE_IN, { recursive: true })
  const serialized = JSON.stringify(titles)
  await fs.promises.writeFile(cacheFile, serialized, 'utf-8')
  return titles
}
|
|
|
|
// Public API of this module: the two cache-backed entry points.
module.exports = { getAllPagesAndCache, scrapeAndCachePage }
|