[misc] merge pdf caching into main (#226)

* wip generate directory for hash content

* cleanup, remove console logging

* add content caching module

* Return PDF stream ranges with compile response

* Return the PDF file size in the compile response

* PDF range endpoint

* [misc] WIP: pdf caching: preserve the m-time on static content files

* [misc] WIP: pdf caching: improve browser caching, emit caching headers

* [misc] WIP: pdf caching: do not emit very small chunks <1kB

* [misc] keep up with moving output files into a separate directory

* [OutputCacheManager] add global feature flag for enabling pdf caching

* [misc] add contentId into the URL for protecting PDF stream contents

* [misc] support PDF stream caching for anonymous users

* [misc] add per-request feature flag for enabling PDF stream caching

* [misc] enable pdf caching in CI and emit metrics at the end of run

* [misc] expose compile stats and timings to the frontend

* [misc] log an error in case saving output files fails

* [misc] add metrics for pdf bandwidth and pdf caching performance

* [misc] add a dark mode to the pdf caching for computing ranges only

* [misc] move pdf caching metrics into ContentCacheMetrics

* [misc] add a config option for the min chunk size of pdf ranges

Co-authored-by: Brian Gough <brian.gough@overleaf.com>
Co-authored-by: Eric Mc Sween <eric.mcsween@overleaf.com>
Jakob Ackermann 2021-05-13 15:07:54 +02:00 committed by GitHub
parent b3ed820444
commit b456ea726d
15 changed files with 487 additions and 27 deletions
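
Taken together, these changes let the compile response advertise cacheable PDF byte ranges. A sketch of the new response shape, assuming a successful compile with caching enabled; all concrete values below are invented for illustration:

{
  "compile": {
    "status": "success",
    "error": null,
    "stats": { "pdf-size": 158000, "pdf-caching-n-ranges": 12, "pdf-caching-n-new-ranges": 3 },
    "timings": { "compile": 1200, "compileE2E": 1450, "compute-pdf-caching": 45 },
    "outputFiles": [
      {
        "url": "https://clsi.example.com/project/<projectId>/build/<buildId>/output/output.pdf",
        "path": "output.pdf",
        "type": "pdf",
        "build": "<buildId>",
        "contentId": "<contentId>",
        "size": 158000,
        "ranges": [
          { "start": 1518, "end": 24031, "hash": "<sha256 of the stream bytes>" }
        ]
      }
    ]
  }
}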

View file

@@ -10,6 +10,7 @@ const Metrics = require('@overleaf/metrics')
Metrics.initialize('clsi')
const CompileController = require('./app/js/CompileController')
const ContentController = require('./app/js/ContentController')
const Settings = require('settings-sharelatex')
const logger = require('logger-sharelatex')
logger.initialize('clsi')
@@ -170,6 +171,15 @@
}
)
app.get(
'/project/:projectId/content/:contentId/:hash',
ContentController.getPdfRange
)
app.get(
'/project/:projectId/user/:userId/content/:contentId/:hash',
ContentController.getPdfRange
)
app.get('/project/:project_id/build/:build_id/output/*', function (
req,
res,
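
For illustration, a client could fetch a single cached stream through the new routes above; a minimal sketch using the request library, with the host and all identifiers as placeholders:

const request = require('request')
// The route shape matches the handlers registered above.
request.get(
  `${clsiUrl}/project/${projectId}/content/${contentId}/${hash}`,
  { encoding: null }, // keep the body as a raw Buffer
  (err, response, body) => {
    // body holds the bytes of one "stream ... endstream" chunk of the PDF
  }
)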

View file

@@ -49,7 +49,9 @@ module.exports = CompileController = {
}
return CompileManager.doCompileWithLock(request, function (
error,
outputFiles
outputFiles,
stats,
timings
) {
let code, status
if (outputFiles == null) {
@@ -118,18 +120,30 @@ module.exports = CompileController = {
compile: {
status,
error: (error != null ? error.message : undefined) || error,
outputFiles: outputFiles.map((file) => ({
url:
`${Settings.apis.clsi.url}/project/${request.project_id}` +
(request.user_id != null
? `/user/${request.user_id}`
: '') +
(file.build != null ? `/build/${file.build}` : '') +
`/output/${file.path}`,
path: file.path,
type: file.type,
build: file.build
}))
stats,
timings,
outputFiles: outputFiles.map((file) => {
const record = {
url:
`${Settings.apis.clsi.url}/project/${request.project_id}` +
(request.user_id != null
? `/user/${request.user_id}`
: '') +
(file.build != null ? `/build/${file.build}` : '') +
`/output/${file.path}`,
path: file.path,
type: file.type,
build: file.build,
contentId: file.contentId
}
if (file.ranges != null) {
record.ranges = file.ranges
}
if (file.size != null) {
record.size = file.size
}
return record
})
}
})
})
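
The URL assembly above produces, for example (identifiers invented; build ids are HEXDATE-HEXRANDOM):

// anonymous compile (request.user_id == null):
//   https://clsi.example.com/project/abc123/build/179a8b4f2d1-0123456789abcdef/output/output.pdf
// per-user compile:
//   https://clsi.example.com/project/abc123/user/def456/build/179a8b4f2d1-0123456789abcdef/output/output.pdf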

View file

@@ -34,6 +34,7 @@ const os = require('os')
const async = require('async')
const Errors = require('./Errors')
const CommandRunner = require('./CommandRunner')
const { emitPdfStats } = require('./ContentCacheMetrics')
const getCompileName = function (project_id, user_id) {
if (user_id != null) {
@@ -77,6 +78,7 @@ module.exports = CompileManager = {
const compileDir = getCompileDir(request.project_id, request.user_id)
const outputDir = getOutputDir(request.project_id, request.user_id)
const timerE2E = new Metrics.Timer('compile-e2e')
let timer = new Metrics.Timer('write-to-disk')
logger.log(
{ project_id: request.project_id, user_id: request.user_id },
@@ -249,11 +251,13 @@ module.exports = CompileManager = {
return callback(error)
}
Metrics.inc('compiles-succeeded')
stats = stats || {}
const object = stats || {}
for (metric_key in object) {
metric_value = object[metric_key]
Metrics.count(metric_key, metric_value)
}
timings = timings || {}
const object1 = timings || {}
for (metric_key in object1) {
metric_value = object1[metric_key]
@@ -297,10 +301,32 @@ module.exports = CompileManager = {
return callback(error)
}
return OutputCacheManager.saveOutputFiles(
{ request, stats, timings },
outputFiles,
compileDir,
outputDir,
(error, newOutputFiles) => callback(null, newOutputFiles)
(err, newOutputFiles) => {
if (err) {
const {
project_id: projectId,
user_id: userId
} = request
logger.err(
{ projectId, userId, err },
'failed to save output files'
)
}
// Emit compile time.
timings.compile = ts
timings.compileE2E = timerE2E.done()
if (stats['pdf-size']) {
emitPdfStats(stats, timings)
}
callback(null, newOutputFiles, stats, timings)
}
)
}
)
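
doCompileWithLock now reports stats and timings alongside the output files; a minimal consumer sketch (the logging call is illustrative, the signature matches the callback above):

CompileManager.doCompileWithLock(request, function (error, outputFiles, stats, timings) {
  if (error) return callback(error)
  // stats: counters, e.g. 'pdf-size' and the pdf-caching-* values
  // timings: durations, e.g. compile, compileE2E, 'compute-pdf-caching'
  logger.log({ stats, timings }, 'compile finished')
  callback(null, outputFiles)
})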

View file

@@ -0,0 +1,118 @@
/**
* ContentCacheManager - maintains a cache of stream hashes from a PDF file
*/
const { callbackify } = require('util')
const fs = require('fs')
const crypto = require('crypto')
const Path = require('path')
const Settings = require('settings-sharelatex')
const MIN_CHUNK_SIZE = Settings.pdfCachingMinChunkSize
/**
*
* @param {String} contentDir path to directory where content hash files are cached
* @param {String} filePath the pdf file to scan for streams
*/
async function update(contentDir, filePath) {
const stream = fs.createReadStream(filePath)
const extractor = new PdfStreamsExtractor()
const ranges = []
const newRanges = []
for await (const chunk of stream) {
const pdfStreams = extractor.consume(chunk)
for (const pdfStream of pdfStreams) {
if (pdfStream.end - pdfStream.start < MIN_CHUNK_SIZE) continue
const hash = pdfStreamHash(pdfStream.buffers)
const range = { start: pdfStream.start, end: pdfStream.end, hash }
ranges.push(range)
if (await writePdfStream(contentDir, hash, pdfStream.buffers)) {
newRanges.push(range)
}
}
}
return [ranges, newRanges]
}
class PdfStreamsExtractor {
constructor() {
this.fileIndex = 0
this.inStream = false
this.streamStartIndex = 0
this.buffers = []
}
consume(chunk) {
let chunkIndex = 0
const pdfStreams = []
while (true) {
if (!this.inStream) {
// Not in a stream, look for stream start
const index = chunk.indexOf('stream', chunkIndex)
if (index === -1) {
// Couldn't find stream start
break
}
// Found stream start, start a stream
this.inStream = true
this.streamStartIndex = this.fileIndex + index
chunkIndex = index
} else {
// In a stream, look for stream end
const index = chunk.indexOf('endstream', chunkIndex)
if (index === -1) {
this.buffers.push(chunk.slice(chunkIndex))
break
}
// add "endstream" part
const endIndex = index + 9
this.buffers.push(chunk.slice(chunkIndex, endIndex))
pdfStreams.push({
start: this.streamStartIndex,
end: this.fileIndex + endIndex,
buffers: this.buffers
})
this.inStream = false
this.buffers = []
chunkIndex = endIndex
}
}
this.fileIndex += chunk.length
return pdfStreams
}
}
function pdfStreamHash(buffers) {
const hash = crypto.createHash('sha256')
for (const buffer of buffers) {
hash.update(buffer)
}
return hash.digest('hex')
}
async function writePdfStream(dir, hash, buffers) {
const filename = Path.join(dir, hash)
try {
await fs.promises.stat(filename)
// The file exists. Do not rewrite the content.
// It would change the modified-time of the file and hence invalidate the
// ETags used for client side caching via browser internals.
return false
} catch (e) {}
const file = await fs.promises.open(filename, 'w')
if (Settings.enablePdfCachingDark) {
// Write an empty file in dark mode.
buffers = []
}
try {
for (const buffer of buffers) {
await file.write(buffer)
}
} finally {
await file.close()
}
return true
}
module.exports = { update: callbackify(update) }
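
For context on what the extractor scans for: in PDF syntax, binary data sits between the stream and endstream keywords of a stream object. A toy example of such an object (standard PDF syntax, not part of this commit):

5 0 obj
<< /Length 36 >>
stream
BT /F1 12 Tf 72 712 Td (Hello) Tj ET
endstream
endobj

The recorded start/end offsets run from the stream keyword through the end of endstream, which is why consume() adds 9, the length of "endstream", to the match index.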

View file

@@ -0,0 +1,80 @@
const Metrics = require('./Metrics')
const ONE_MB = 1024 * 1024
function emitPdfStats(stats, timings) {
if (timings['compute-pdf-caching']) {
emitPdfCachingStats(stats, timings)
} else {
// How much bandwidth will the pdf incur when downloaded in full?
Metrics.summary('pdf-bandwidth', stats['pdf-size'])
}
}
function emitPdfCachingStats(stats, timings) {
if (!stats['pdf-size']) return // double check
// How large is the overhead of hashing up-front?
const fraction =
timings.compileE2E - timings['compute-pdf-caching'] !== 0
? timings.compileE2E /
(timings.compileE2E - timings['compute-pdf-caching'])
: 1
Metrics.summary('overhead-compute-pdf-ranges', fraction * 100 - 100)
// How does the hashing scale to pdf size in MB?
Metrics.timing(
'compute-pdf-caching-relative-to-pdf-size',
timings['compute-pdf-caching'] / (stats['pdf-size'] / ONE_MB)
)
if (stats['pdf-caching-total-ranges-size']) {
// How does the hashing scale to total ranges size in MB?
Metrics.timing(
'compute-pdf-caching-relative-to-total-ranges-size',
timings['compute-pdf-caching'] /
(stats['pdf-caching-total-ranges-size'] / ONE_MB)
)
// How fast is the hashing per range on average?
Metrics.timing(
'compute-pdf-caching-relative-to-ranges-count',
timings['compute-pdf-caching'] / stats['pdf-caching-n-ranges']
)
// How many ranges are new?
Metrics.summary(
'new-pdf-ranges-relative-to-total-ranges',
(stats['pdf-caching-n-new-ranges'] / stats['pdf-caching-n-ranges']) * 100
)
}
// How much content is cacheable?
Metrics.summary(
'cacheable-ranges-to-pdf-size',
(stats['pdf-caching-total-ranges-size'] / stats['pdf-size']) * 100
)
const sizeWhenDownloadedInFull =
// All of the pdf
stats['pdf-size'] -
// These ranges are potentially cached.
stats['pdf-caching-total-ranges-size'] +
// These ranges are not cached.
stats['pdf-caching-new-ranges-size']
// How much bandwidth can we save when downloading the pdf in full?
Metrics.summary(
'pdf-bandwidth-savings',
100 - (sizeWhenDownloadedInFull / stats['pdf-size']) * 100
)
// How much bandwidth will the pdf incur when downloaded in full?
Metrics.summary('pdf-bandwidth', sizeWhenDownloadedInFull)
// How much space do the ranges use?
// This will accumulate the ranges size over time, skipping already written ranges.
Metrics.summary('pdf-ranges-disk-size', stats['pdf-caching-new-ranges-size'])
}
module.exports = {
emitPdfStats
}
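
A worked example of the bandwidth bookkeeping above, with invented numbers:

const ONE_MB = 1024 * 1024
const stats = {
  'pdf-size': 2.0 * ONE_MB, // the full pdf
  'pdf-caching-total-ranges-size': 1.5 * ONE_MB, // potentially cached
  'pdf-caching-new-ranges-size': 0.3 * ONE_MB // not cached yet
}
const sizeWhenDownloadedInFull =
  stats['pdf-size'] -
  stats['pdf-caching-total-ranges-size'] +
  stats['pdf-caching-new-ranges-size'] // 0.8 MB
// pdf-bandwidth-savings: 100 - (0.8 / 2.0) * 100 = 60 percent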

View file

@@ -0,0 +1,24 @@
const Path = require('path')
const send = require('send')
const Settings = require('settings-sharelatex')
const OutputCacheManager = require('./OutputCacheManager')
const ONE_DAY_S = 24 * 60 * 60
const ONE_DAY_MS = ONE_DAY_S * 1000
function getPdfRange(req, res, next) {
const { projectId, userId, contentId, hash } = req.params
const perUserDir = userId ? `${projectId}-${userId}` : projectId
const path = Path.join(
Settings.path.outputDir,
perUserDir,
OutputCacheManager.CONTENT_SUBDIR,
contentId,
hash
)
res.setHeader('cache-control', `public, max-age=${ONE_DAY_S}`)
res.setHeader('expires', new Date(Date.now() + ONE_DAY_MS).toUTCString())
send(req, path).pipe(res)
}
module.exports = { getPdfRange }
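
A response from this endpoint should look roughly as follows; the exact ETag and Last-Modified values come from send's file metadata, which is why ContentCacheManager refuses to rewrite existing hash files (a changed modified-time would invalidate the ETag):

HTTP/1.1 200 OK
Cache-Control: public, max-age=86400
Expires: Fri, 14 May 2021 13:07:54 GMT
Accept-Ranges: bytes
Last-Modified: Thu, 13 May 2021 13:07:54 GMT
ETag: W/"57e4-179..."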

View file

@@ -22,10 +22,13 @@ const logger = require('logger-sharelatex')
const _ = require('lodash')
const Settings = require('settings-sharelatex')
const crypto = require('crypto')
const Metrics = require('./Metrics')
const OutputFileOptimiser = require('./OutputFileOptimiser')
const ContentCacheManager = require('./ContentCacheManager')
module.exports = OutputCacheManager = {
CONTENT_SUBDIR: 'content',
CACHE_SUBDIR: 'generated-files',
ARCHIVE_SUBDIR: 'archived-logs',
// build id is HEXDATE-HEXRANDOM from Date.now() and RandomBytes
@@ -59,7 +62,13 @@ module.exports = OutputCacheManager = {
})
},
saveOutputFiles(outputFiles, compileDir, outputDir, callback) {
saveOutputFiles(
{ request, stats, timings },
outputFiles,
compileDir,
outputDir,
callback
) {
if (callback == null) {
callback = function (error) {}
}
@@ -72,7 +81,31 @@ module.exports = OutputCacheManager = {
compileDir,
outputDir,
buildId,
callback
function (err, result) {
if (err != null) {
return callback(err)
}
OutputCacheManager.collectOutputPdfSize(
result,
outputDir,
stats,
(err, result) => {
if (err) return callback(err, result)
if (!Settings.enablePdfCaching || !request.enablePdfCaching) {
return callback(null, result)
}
OutputCacheManager.saveStreamsInContentDir(
{ stats, timings },
result,
compileDir,
outputDir,
callback
)
}
)
}
)
})
},
@@ -206,6 +239,107 @@ module.exports = OutputCacheManager = {
})
},
collectOutputPdfSize(outputFiles, outputDir, stats, callback) {
const outputFile = outputFiles.find((x) => x.path === 'output.pdf')
if (!outputFile) return callback(null, outputFiles)
const outputFilePath = Path.join(
outputDir,
OutputCacheManager.path(outputFile.build, outputFile.path)
)
fs.stat(outputFilePath, (err, stat) => {
if (err) return callback(err, outputFiles)
outputFile.size = stat.size
stats['pdf-size'] = outputFile.size
callback(null, outputFiles)
})
},
saveStreamsInContentDir(
{ stats, timings },
outputFiles,
compileDir,
outputDir,
callback
) {
const cacheRoot = Path.join(outputDir, OutputCacheManager.CONTENT_SUBDIR)
// check if content dir exists
OutputCacheManager.ensureContentDir(cacheRoot, function (err, contentDir) {
if (err) return callback(err, outputFiles)
const outputFile = outputFiles.find((x) => x.path === 'output.pdf')
if (outputFile) {
// possibly we should copy the file from the build dir here
const outputFilePath = Path.join(
outputDir,
OutputCacheManager.path(outputFile.build, outputFile.path)
)
const timer = new Metrics.Timer('compute-pdf-ranges')
ContentCacheManager.update(contentDir, outputFilePath, function (
err,
ranges
) {
if (err) return callback(err, outputFiles)
const [contentRanges, newContentRanges] = ranges
if (Settings.enablePdfCachingDark) {
// In dark mode we are doing the computation only and do not emit
// any ranges to the frontend.
} else {
outputFile.contentId = Path.basename(contentDir)
outputFile.ranges = contentRanges
}
timings['compute-pdf-caching'] = timer.done()
stats['pdf-caching-n-ranges'] = contentRanges.length
stats['pdf-caching-total-ranges-size'] = contentRanges.reduce(
(sum, next) => sum + (next.end - next.start),
0
)
stats['pdf-caching-n-new-ranges'] = newContentRanges.length
stats['pdf-caching-new-ranges-size'] = newContentRanges.reduce(
(sum, next) => sum + (next.end - next.start),
0
)
callback(null, outputFiles)
})
} else {
callback(null, outputFiles)
}
})
},
ensureContentDir(contentRoot, callback) {
fse.ensureDir(contentRoot, function (err) {
if (err != null) {
return callback(err)
}
fs.readdir(contentRoot, function (err, results) {
const dirs = results.sort()
const contentId = dirs.find((dir) =>
OutputCacheManager.BUILD_REGEX.test(dir)
)
if (contentId) {
callback(null, Path.join(contentRoot, contentId))
} else {
// make a content directory
OutputCacheManager.generateBuildId(function (err, contentId) {
if (err) {
return callback(err)
}
const contentDir = Path.join(contentRoot, contentId)
fse.ensureDir(contentDir, function (err) {
if (err) {
return callback(err)
}
return callback(null, contentDir)
})
})
}
})
})
},
archiveLogs(outputFiles, compileDir, outputDir, buildId, callback) {
if (callback == null) {
callback = function (error) {}
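
Putting CONTENT_SUBDIR and ensureContentDir together, a cached stream ends up at a path like the one below; this mirrors how ContentController resolves it (segment values invented):

const cachedStreamPath = Path.join(
  Settings.path.outputDir,
  `${projectId}-${userId}`, // or just projectId for anonymous compiles
  OutputCacheManager.CONTENT_SUBDIR, // 'content'
  contentId, // HEXDATE-HEXRANDOM, reused across recompiles of the project
  hash // sha256 hex digest of the stream bytes
)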

View file

@@ -50,6 +50,14 @@ module.exports = RequestParser = {
type: 'string'
}
)
response.enablePdfCaching = this._parseAttribute(
'enablePdfCaching',
compile.options.enablePdfCaching,
{
default: false,
type: 'boolean'
}
)
response.timeout = this._parseAttribute(
'timeout',
compile.options.timeout,
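
A request that opts into caching would then carry the flag in its compile options; a minimal body sketch (other fields elided):

{
  "compile": {
    "options": {
      "enablePdfCaching": true
    },
    "resources": []
  }
}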

View file

@@ -63,7 +63,12 @@ module.exports = {
texliveOpenoutAny: process.env.TEXLIVE_OPENOUT_ANY,
sentry: {
dsn: process.env.SENTRY_DSN
}
},
enablePdfCaching: process.env.ENABLE_PDF_CACHING === 'true',
enablePdfCachingDark: process.env.ENABLE_PDF_CACHING_DARK === 'true',
pdfCachingMinChunkSize:
parseInt(process.env.PDF_CACHING_MIN_CHUNK_SIZE, 10) || 1024
}
if (process.env.ALLOWED_COMPILE_GROUPS) {

View file

@@ -29,6 +29,7 @@ services:
NODE_ENV: test
NODE_OPTIONS: "--unhandled-rejections=strict"
TEXLIVE_IMAGE:
ENABLE_PDF_CACHING: "true"
command: npm run test:acceptance:_run

View file

@@ -38,5 +38,6 @@ services:
LOG_LEVEL: ERROR
NODE_ENV: test
NODE_OPTIONS: "--unhandled-rejections=strict"
ENABLE_PDF_CACHING: "true"
command: npm run --silent test:acceptance

View file

@@ -0,0 +1,16 @@
const request = require('request')
const Settings = require('settings-sharelatex')
after(function (done) {
request(
{
url: `${Settings.apis.clsi.url}/metrics`
},
(err, response, body) => {
if (err) return done(err)
console.error('-- metrics --')
console.error(body)
console.error('-- metrics --')
done()
}
)
})

View file

@@ -30,6 +30,10 @@ module.exports = Client = {
if (callback == null) {
callback = function (error, res, body) {}
}
if (data) {
// Enable pdf caching unless disabled explicitly.
data.options = Object.assign({}, { enablePdfCaching: true }, data.options)
}
return request.post(
{
url: `${this.host}/project/${project_id}/compile`,
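
The Object.assign default still lets a test opt out explicitly, since later sources win:

Object.assign({}, { enablePdfCaching: true }, { enablePdfCaching: false })
// => { enablePdfCaching: false }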

View file

@@ -113,6 +113,8 @@ describe('CompileController', function () {
this.ProjectPersistenceManager.markProjectAsJustAccessed = sinon
.stub()
.callsArg(1)
this.stats = { foo: 1 }
this.timings = { bar: 2 }
this.res.status = sinon.stub().returnsThis()
return (this.res.send = sinon.stub())
})
@@ -121,7 +123,7 @@
beforeEach(function () {
this.CompileManager.doCompileWithLock = sinon
.stub()
.callsArgWith(1, null, this.output_files)
.yields(null, this.output_files, this.stats, this.timings)
return this.CompileController.compile(this.req, this.res)
})
@@ -150,12 +152,16 @@
compile: {
status: 'success',
error: null,
stats: this.stats,
timings: this.timings,
outputFiles: this.output_files.map((file) => {
return {
url: `${this.Settings.apis.clsi.url}/project/${this.project_id}/build/${file.build}/output/${file.path}`,
path: file.path,
type: file.type,
build: file.build
build: file.build,
// gets dropped by JSON.stringify
contentId: undefined
}
})
}
@@ -180,7 +186,7 @@
]
this.CompileManager.doCompileWithLock = sinon
.stub()
.callsArgWith(1, null, this.output_files)
.yields(null, this.output_files, this.stats, this.timings)
this.CompileController.compile(this.req, this.res)
})
@@ -191,12 +197,16 @@
compile: {
status: 'failure',
error: null,
stats: this.stats,
timings: this.timings,
outputFiles: this.output_files.map((file) => {
return {
url: `${this.Settings.apis.clsi.url}/project/${this.project_id}/build/${file.build}/output/${file.path}`,
path: file.path,
type: file.type,
build: file.build
build: file.build,
// gets dropped by JSON.stringify
contentId: undefined
}
})
}
@@ -220,7 +230,10 @@
compile: {
status: 'error',
error: this.message,
outputFiles: []
outputFiles: [],
// JSON.stringify will omit these
stats: undefined,
timings: undefined
}
})
.should.equal(true)
@@ -244,7 +257,10 @@
compile: {
status: 'timedout',
error: this.message,
outputFiles: []
outputFiles: [],
// JSON.stringify will omit these
stats: undefined,
timings: undefined
}
})
.should.equal(true)
@@ -266,7 +282,10 @@
compile: {
error: null,
status: 'failure',
outputFiles: []
outputFiles: [],
// JSON.stringify will omit these
stats: undefined,
timings: undefined
}
})
.should.equal(true)

View file

@@ -71,7 +71,7 @@ describe('CompileManager', function () {
this.compileDir = `${this.Settings.path.compilesDir}/${this.project_id}-${this.user_id}`
this.CompileManager.doCompile = sinon
.stub()
.callsArgWith(1, null, this.output_files)
.yields(null, this.output_files)
return (this.LockManager.runWithLock = (lockFile, runner, callback) =>
runner((err, ...result) => callback(err, ...Array.from(result))))
})
@@ -172,10 +172,10 @@ describe('CompileManager', function () {
this.LatexRunner.runLatex = sinon.stub().callsArg(2)
this.OutputFileFinder.findOutputFiles = sinon
.stub()
.callsArgWith(2, null, this.output_files)
.yields(null, this.output_files)
this.OutputCacheManager.saveOutputFiles = sinon
.stub()
.callsArgWith(3, null, this.build_files)
.yields(null, this.build_files)
this.DraftModeManager.injectDraftMode = sinon.stub().callsArg(1)
return (this.TikzManager.checkMainFile = sinon.stub().callsArg(3, false))
})