Merge pull request #8730 from overleaf/jpa-refactor-pdf-caching

[web] refactor pdf caching

GitOrigin-RevId: af6b871fad652f757d8c465085b33a79b955cd6b
Jakob Ackermann 2022-07-06 12:06:53 +01:00 committed by Copybot
parent 0c63515de6
commit 64184a7f56
18 changed files with 683 additions and 534 deletions


@ -569,15 +569,19 @@ const ClsiManager = {
_parseOutputFiles(projectId, rawOutputFiles = []) {
const outputFiles = []
for (const file of rawOutputFiles) {
outputFiles.push({
const f = {
path: file.path, // the clsi is now sending this to web
url: new URL(file.url).pathname, // the location of the file on the clsi, excluding the host part
type: file.type,
build: file.build,
contentId: file.contentId,
ranges: file.ranges,
size: file.size,
})
}
if (file.path === 'output.pdf') {
f.contentId = file.contentId
f.ranges = file.ranges || []
f.size = file.size
f.createdAt = new Date()
}
outputFiles.push(f)
}
return outputFiles
},
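
For context, only output.pdf now carries the caching metadata; other output files stay minimal. A worked example of the mapping, using the same hypothetical CLSI values as the updated unit tests further down:

// Hypothetical CLSI response entry (values mirror the test fixtures below):
const rawPdf = {
  url: 'https://clsi.example.com/project/p/user/u/build/1234/output/output.pdf',
  path: 'output.pdf',
  type: 'pdf',
  build: 1234,
  contentId: '123-321',
  ranges: [{ start: 1, end: 42, hash: 'foo' }],
  size: 42,
}
// _parseOutputFiles() now maps this to:
// { path, url: '/project/p/user/u/build/1234/output/output.pdf', type, build,
//   contentId: '123-321', ranges: [...], size: 42, createdAt: new Date() }
// while an output.log entry keeps only path, url, type and build.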


@ -1037,7 +1037,7 @@ const ProjectController = {
}
}
function partOfPdfCachingRollout(flag) {
function getTrackPdfDownload() {
if (!Settings.enablePdfCaching) {
// The feature is disabled globally.
return false
@ -1045,7 +1045,20 @@ const ProjectController = {
// Let the user opt-in only.
// The flag will opt-out of both caching and metrics collection,
// as if this editing session never happened.
return shouldDisplayFeature('enable_pdf_caching', false)
return shouldDisplayFeature('track-pdf-download', false)
}
function getPdfCachingMode() {
if (!Settings.enablePdfCaching) {
// The feature is disabled globally.
return ''
}
// Let the user opt-in only.
const v = req.query['pdf-caching-mode']
if (['service-worker', 'no-service-worker'].includes(v)) {
return v
}
return ''
}
const showPdfDetach = shouldDisplayFeature(
@ -1152,11 +1165,8 @@ const ProjectController = {
showNewSourceEditorOption,
showSymbolPalette,
showStopOnFirstError,
trackPdfDownload: partOfPdfCachingRollout('collect-metrics'),
enablePdfCaching: partOfPdfCachingRollout('enable-caching'),
resetServiceWorker:
Boolean(Settings.resetServiceWorker) &&
!shouldDisplayFeature('enable_pdf_caching', false),
trackPdfDownload: getTrackPdfDownload(),
pdfCachingMode: getPdfCachingMode(),
detachRole,
metadata: { viewport: false },
showHeaderUpgradePrompt,
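
In other words, the old enable_pdf_caching flag is split into two independent opt-ins read from the editor URL: track-pdf-download for metrics collection and pdf-caching-mode for the caching implementation. A condensed sketch of the new behaviour (not the production code; query values invented):

// Minimal sketch of the mode validation, assuming req.query as in Express:
const VALID_MODES = ['service-worker', 'no-service-worker']
function pdfCachingModeFromQuery(query, enablePdfCaching = true) {
  if (!enablePdfCaching) return '' // feature disabled globally
  const v = query['pdf-caching-mode']
  return VALID_MODES.includes(v) ? v : ''
}
pdfCachingModeFromQuery({ 'pdf-caching-mode': 'service-worker' }) // 'service-worker'
pdfCachingModeFromQuery({ 'pdf-caching-mode': 'yes' })            // '' (caching off)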


@ -26,9 +26,8 @@ meta(name="ol-showPdfDetach" data-type="boolean" content=showPdfDetach)
meta(name="ol-debugPdfDetach" data-type="boolean" content=debugPdfDetach)
meta(name="ol-showNewSourceEditorOption" data-type="boolean" content=showNewSourceEditorOption)
meta(name="ol-showSymbolPalette" data-type="boolean" content=showSymbolPalette)
meta(name="ol-enablePdfCaching" data-type="boolean" content=enablePdfCaching)
meta(name="ol-pdfCachingMode" content=pdfCachingMode)
meta(name="ol-trackPdfDownload" data-type="boolean" content=trackPdfDownload)
meta(name="ol-resetServiceWorker" data-type="boolean" content=resetServiceWorker)
meta(name="ol-detachRole" data-type="string" content=detachRole)
meta(name="ol-showHeaderUpgradePrompt" data-type="boolean" content=showHeaderUpgradePrompt)
meta(name="ol-showStopOnFirstError" data-type="boolean" content=showStopOnFirstError)


@ -10,7 +10,7 @@ import { useProjectContext } from '../../../shared/context/project-context'
const SAY_THANKS_TIMEOUT = 10 * 1000
function FasterCompilesFeedbackContent() {
const { clsiServerId, deliveryLatencies, pdfSize, pdfUrl } =
const { clsiServerId, deliveryLatencies, pdfFile, pdfUrl } =
useCompileContext()
const { _id: projectId } = useProjectContext()
@ -52,7 +52,7 @@ function FasterCompilesFeedbackContent() {
projectId,
server: clsiServerId?.includes('-c2d-') ? 'faster' : 'normal',
feedback,
pdfSize,
pdfSize: pdfFile.size,
...deliveryLatencies,
})
setHasRatedProject(true)


@ -11,7 +11,7 @@ import ErrorBoundaryFallback from './error-boundary-fallback'
import { useDetachCompileContext as useCompileContext } from '../../../shared/context/detach-compile-context'
import { captureException } from '../../../infrastructure/error-reporter'
function PdfJsViewer({ url }) {
function PdfJsViewer({ url, pdfFile }) {
const { _id: projectId } = useProjectContext()
const {
@ -90,13 +90,13 @@ function PdfJsViewer({ url }) {
setInitialised(false)
setError(undefined)
pdfJsWrapper.loadDocument(url).catch(error => {
pdfJsWrapper.loadDocument(url, pdfFile).catch(error => {
console.error(error)
setError('rendering-error')
})
return () => pdfJsWrapper.abortDocumentLoading()
}
}, [pdfJsWrapper, url, setError])
}, [pdfJsWrapper, url, pdfFile, setError])
// listen for scroll events
useEffect(() => {
@ -344,6 +344,7 @@ function PdfJsViewer({ url }) {
PdfJsViewer.propTypes = {
url: PropTypes.string.isRequired,
pdfFile: PropTypes.object,
}
export default withErrorBoundary(memo(PdfJsViewer), () => (


@ -6,7 +6,7 @@ const PdfJsViewer = lazy(() =>
)
function PdfViewer() {
const { pdfUrl, pdfViewer } = useCompileContext()
const { pdfUrl, pdfFile, pdfViewer } = useCompileContext()
if (!pdfUrl) {
return null
@ -18,7 +18,7 @@ function PdfViewer() {
case 'pdfjs':
default:
return <PdfJsViewer url={pdfUrl} />
return <PdfJsViewer url={pdfUrl} pdfFile={pdfFile} />
}
}


@ -167,7 +167,7 @@ export default class DocumentCompiler {
}
// use the feature flag to enable PDF caching in a ServiceWorker
if (getMeta('ol-enablePdfCaching')) {
if (getMeta('ol-pdfCachingMode')) {
params.set('enable_pdf_caching', 'true')
}
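
Note that the compile request itself still sends a plain boolean to the CLSI; any non-empty mode turns on the ranges/contentId output server-side, while the mode string only matters for the download URL (see the output-files hunk below). A tiny sketch:

// Either non-empty mode produces the same compile request parameter:
const mode = 'no-service-worker' // e.g. getMeta('ol-pdfCachingMode')
const params = new URLSearchParams()
if (mode) params.set('enable_pdf_caching', 'true')
params.toString() // 'enable_pdf_caching=true'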


@ -5,7 +5,7 @@ import getMeta from '../../../utils/meta'
// VERSION should get incremented when making changes to caching behavior or
// adjusting metrics collection.
// Keep in sync with the service worker.
const VERSION = 2
const VERSION = 3
const pdfJsMetrics = {
version: VERSION,
@ -14,6 +14,12 @@ const pdfJsMetrics = {
totalBandwidth: 0,
}
let pdfCachingMetrics
export function setCachingMetrics(metrics) {
pdfCachingMetrics = metrics
}
const SAMPLING_RATE = 0.01
export function trackPdfDownload(response, compileTimeClientE2E) {
@ -60,8 +66,10 @@ export function trackPdfDownload(response, compileTimeClientE2E) {
timings,
})
})
// Submit bandwidth counter separate from compile context.
submitPDFBandwidth({ pdfJsMetrics, serviceWorkerMetrics })
if (getMeta('ol-pdfCachingMode') === 'service-worker') {
// Submit (serviceWorker) bandwidth counter separate from compile context.
submitPDFBandwidth({ pdfJsMetrics, serviceWorkerMetrics })
}
}
return {
@ -78,9 +86,11 @@ function submitCompileMetrics(metrics) {
latencyFetch,
latencyRender,
compileTimeClientE2E,
id: pdfJsMetrics.id,
...(pdfCachingMetrics || {}),
}
sl_console.log('/event/compile-metrics', JSON.stringify(metrics))
sendMB('compile-metrics-v5', leanMetrics, SAMPLING_RATE)
sendMB('compile-metrics-v6', leanMetrics, SAMPLING_RATE)
}
function submitPDFBandwidth(metrics) {
@ -110,5 +120,5 @@ function submitPDFBandwidth(metrics) {
return
}
sl_console.log('/event/pdf-bandwidth', JSON.stringify(metrics))
sendMB('pdf-bandwidth-v5', leanMetrics, SAMPLING_RATE)
sendMB('pdf-bandwidth-v6', leanMetrics, SAMPLING_RATE)
}
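
The new setCachingMetrics hook lets the no-service-worker transport (added below) feed its counters into the sampled compile-metrics-v6 event, while pdf-bandwidth-v6 is only submitted when the service worker is actually in play. A rough wiring sketch (names as in this commit, values invented):

import { setCachingMetrics } from './metrics'

const counters = { cachedCount: 0, cachedBytes: 0, fetchedCount: 0, fetchedBytes: 0 }
setCachingMetrics(counters)
// fetchRange() mutates `counters` in place on every range request, and
// submitCompileMetrics() spreads them into the payload via ...(pdfCachingMetrics || {}).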


@ -11,7 +11,7 @@ export function handleOutputFiles(outputFiles, projectId, data) {
const result = {}
const outputFile = outputFiles.get('output.pdf')
result.pdfSize = outputFile?.size
result.pdfFile = outputFile
if (outputFile) {
// build the URL for viewing the PDF in the preview UI
@ -28,9 +28,9 @@ export function handleOutputFiles(outputFiles, projectId, data) {
params.set('verify_chunks', 'true')
}
if (getMeta('ol-enablePdfCaching')) {
if (getMeta('ol-pdfCachingMode')) {
// Tag traffic that uses the pdf caching logic.
params.set('enable_pdf_caching', 'true')
params.set('enable_pdf_caching', getMeta('ol-pdfCachingMode'))
}
result.pdfUrl = `${buildURL(outputFile, data.pdfDownloadDomain)}?${params}`
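
Because enable_pdf_caching now carries the mode string instead of 'true', downstream services can tell the two caching flavours apart in their traffic. A small sketch of the resulting preview URL (domain and ids invented):

const params = new URLSearchParams({ compileGroup: 'standard', clsiserverid: 'clsi-42' })
const mode = 'no-service-worker' // e.g. getMeta('ol-pdfCachingMode')
if (mode) params.set('enable_pdf_caching', mode)
const pdfUrl = `https://compiles.example.com/project/abc/build/1234/output/output.pdf?${params}`
// -> ...output.pdf?compileGroup=standard&clsiserverid=clsi-42&enable_pdf_caching=no-service-worker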


@ -0,0 +1,61 @@
import { fallbackRequest, fetchRange } from './pdf-caching'
import getMeta from '../../../utils/meta'
import { captureException } from '../../../infrastructure/error-reporter'
import { setCachingMetrics } from './metrics'
export function generatePdfCachingTransportFactory(PDFJS) {
if (getMeta('ol-pdfCachingMode') !== 'no-service-worker') {
return () => null
}
const cached = new Set()
const metrics = {
failedCount: 0,
tooLargeOverheadCount: 0,
tooManyRequestsCount: 0,
cachedCount: 0,
cachedBytes: 0,
fetchedCount: 0,
fetchedBytes: 0,
requestedCount: 0,
requestedBytes: 0,
}
setCachingMetrics(metrics)
class PDFDataRangeTransport extends PDFJS.PDFDataRangeTransport {
constructor(url, pdfFile, reject) {
super(pdfFile.size, new Uint8Array())
this.url = url
this.pdfFile = pdfFile
this.reject = reject
}
requestDataRange(start, end) {
fetchRange({
url: this.url,
start,
end,
file: this.pdfFile,
metrics,
cached,
})
.catch(err => {
metrics.failedCount++
console.error('optimized pdf download error', err)
captureException(err)
return fallbackRequest({ url: this.url, start, end })
})
.then(blob => {
this.onDataRange(start, blob)
})
.catch(err => {
console.error('fatal pdf download error', err)
captureException(err)
this.reject(err)
})
}
}
return function (url, pdfFile, reject) {
return new PDFDataRangeTransport(url, pdfFile, reject)
}
}
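
PDF.js accepts such a transport through the range option of getDocument, which is exactly how the wrapper further down wires it up (range: this.genPdfCachingTransport(url, pdfFile, reject)). A minimal usage sketch, assuming pdfjs-dist with its worker already configured and a pdfFile object shaped like the ClsiManager output above (paths/ids invented):

import * as PDFJS from 'pdfjs-dist'
import { generatePdfCachingTransportFactory } from './pdf-caching-transport'

const genTransport = generatePdfCachingTransportFactory(PDFJS) // () => null unless mode is 'no-service-worker'
const pdfUrl = '/project/abc/build/1234/output/output.pdf?enable_pdf_caching=no-service-worker'
const pdfFile = { size: 123456, contentId: '123-321', ranges: [], createdAt: new Date() }

const loadingTask = PDFJS.getDocument({
  url: pdfUrl,
  disableAutoFetch: true, // let the transport decide which ranges to fetch
  range: genTransport(pdfUrl, pdfFile, err => console.error('pdf load rejected', err)),
})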


@ -0,0 +1,490 @@
import OError from '@overleaf/o-error'
const PDF_JS_CHUNK_SIZE = 128 * 1024
const MAX_SUB_REQUEST_COUNT = 4
const MAX_SUB_REQUEST_BYTES = 4 * PDF_JS_CHUNK_SIZE
const INCREMENTAL_CACHE_SIZE = 1000
const ENCODER = new TextEncoder()
function backfillEdgeBounds(file) {
if (!file.backfilledEdgeBoundsOnce) {
for (const range of file.ranges) {
if (range.objectId) {
range.objectId = ENCODER.encode(range.objectId)
range.start -= range.objectId.byteLength
}
}
}
file.backfilledEdgeBoundsOnce = true
}
/**
* @param {Array} chunks
*/
function countBytes(chunks) {
return chunks.reduce((totalBytes, chunk) => {
return totalBytes + (chunk.end - chunk.start)
}, 0)
}
/**
*
* @param {Object} metrics
* @param {number} size
* @param {number} cachedCount
* @param {number} cachedBytes
* @param {number} fetchedCount
* @param {number} fetchedBytes
*/
function trackDownloadStats(
metrics,
{ size, cachedCount, cachedBytes, fetchedCount, fetchedBytes }
) {
metrics.cachedCount += cachedCount
metrics.cachedBytes += cachedBytes
metrics.fetchedCount += fetchedCount
metrics.fetchedBytes += fetchedBytes
metrics.requestedCount++
metrics.requestedBytes += size
}
/**
* @param {Object} metrics
* @param {boolean} sizeDiffers
* @param {boolean} mismatch
* @param {boolean} success
*/
function trackChunkVerify(metrics, { sizeDiffers, mismatch, success }) {
if (sizeDiffers) {
metrics.chunkVerifySizeDiffers |= 0
metrics.chunkVerifySizeDiffers += 1
}
if (mismatch) {
metrics.chunkVerifyMismatch |= 0
metrics.chunkVerifyMismatch += 1
}
if (success) {
metrics.chunkVerifySuccess |= 0
metrics.chunkVerifySuccess += 1
}
}
/**
* @param chunk
* @param {ArrayBuffer} arrayBuffer
* @return {Uint8Array}
*/
function backFillObjectContext(chunk, arrayBuffer) {
if (!chunk.objectId) {
// This is a dynamic chunk
return new Uint8Array(arrayBuffer)
}
const { start, end, objectId } = chunk
const header = Uint8Array.from(objectId)
const fullBuffer = new Uint8Array(end - start)
fullBuffer.set(header, 0)
fullBuffer.set(new Uint8Array(arrayBuffer), objectId.length)
return fullBuffer
}
/**
* @param {Array} chunks
* @param {number} start
* @param {number} end
* @returns {Array}
*/
function getMatchingChunks(chunks, start, end) {
const matchingChunks = []
for (const chunk of chunks) {
if (chunk.end <= start) {
// no overlap:
// | REQUESTED_RANGE |
// | CHUNK |
continue
}
if (chunk.start >= end) {
// no overlap:
// | REQUESTED_RANGE |
// | CHUNK |
break
}
matchingChunks.push(chunk)
}
return matchingChunks
}
/**
* @param {Array} potentialChunks
* @param {Set} cached
* @param {Object} metrics
*/
function cutRequestAmplification(potentialChunks, cached, metrics) {
const chunks = []
const newChunks = []
let tooManyRequests = false
for (const chunk of potentialChunks) {
if (cached.has(chunk.hash)) {
chunks.push(chunk)
continue
}
if (newChunks.length < MAX_SUB_REQUEST_COUNT) {
chunks.push(chunk)
newChunks.push(chunk)
} else {
tooManyRequests = true
}
}
if (tooManyRequests) {
metrics.tooManyRequestsCount++
}
if (cached.size > INCREMENTAL_CACHE_SIZE) {
for (const key of cached) {
if (cached.size < INCREMENTAL_CACHE_SIZE) {
break
}
// Set entries are stored in insertion order.
// We re-insert keys on cache hit, so 'cached' acts as a cheap LRU.
cached.delete(key)
}
}
return { chunks, newChunks }
}
/**
* @param {Array} chunks
* @param {number} start
* @param {number} end
* @returns {Array}
*/
function getInterleavingDynamicChunks(chunks, start, end) {
const dynamicChunks = []
for (const chunk of chunks) {
if (start < chunk.start) {
dynamicChunks.push({ start, end: chunk.start })
}
start = chunk.end
}
if (start < end) {
dynamicChunks.push({ start, end })
}
return dynamicChunks
}
/**
*
* @param {Response} response
*/
function getServerTime(response) {
const raw = response.headers.get('Date')
if (!raw) return new Date()
return new Date(raw)
}
/**
*
* @param {Response} response
*/
function getResponseSize(response) {
const raw = response.headers.get('Content-Length')
if (!raw) return 0
return parseInt(raw, 10)
}
/**
*
* @param {Response} response
* @param chunk
*/
function getMultipartBoundary(response, chunk) {
if (!Array.isArray(chunk)) return ''
const raw = response.headers.get('Content-Type')
if (raw.includes('multipart/byteranges')) {
const idx = raw.indexOf('boundary=')
if (idx !== -1) return raw.slice(idx + 'boundary='.length)
}
throw new OError('missing boundary on multipart request', {
headers: Object.fromEntries(response.headers.entries()),
chunk,
})
}
/**
* @param {Object} response
* @param {Object} file
* @param {Object} metrics
*/
function resolveMultiPartResponses(response, file, metrics) {
const { chunk: chunks, data, boundary } = response
if (!boundary) {
return [response]
}
const responses = []
let offsetStart = 0
for (const chunk of chunks) {
const header = `\r\n--${boundary}\r\nContent-Type: application/pdf\r\nContent-Range: bytes ${
chunk.start
}-${chunk.end - 1}/${file.size}\r\n\r\n`
const headerSize = header.length
// Verify header content. A proxy might have tampered with it.
const headerRaw = ENCODER.encode(header)
if (
!data
.subarray(offsetStart, offsetStart + headerSize)
.every((v, idx) => v === headerRaw[idx])
) {
metrics.headerVerifyFailure |= 0
metrics.headerVerifyFailure++
throw new OError('multipart response header does not match', {
actual: new TextDecoder().decode(
data.subarray(offsetStart, offsetStart + headerSize)
),
expected: header,
})
}
offsetStart += headerSize
const chunkSize = chunk.end - chunk.start
responses.push({
chunk,
data: data.subarray(offsetStart, offsetStart + chunkSize),
})
offsetStart += chunkSize
}
return responses
}
/**
*
* @param {Response} response
*/
function checkChunkResponse(response) {
if (!(response.status === 206 || response.status === 200)) {
throw new OError('non successful response status: ' + response.status)
}
}
/**
*
* @param {string} url
* @param {number} start
* @param {number} end
*/
export function fallbackRequest({ url, start, end }) {
return fetch(url, { headers: { Range: `bytes=${start}-${end - 1}` } }).then(
response => {
checkChunkResponse(response)
return response.arrayBuffer()
}
)
}
/**
*
* @param {string} url
* @param {number} start
* @param {number} end
* @param {Object} metrics
* @param {Uint8Array} actual
*/
async function verifyRange({ url, start, end, metrics, actual }) {
let expectedRaw
try {
expectedRaw = await fallbackRequest({ url, start, end })
} catch (error) {
throw OError.tag(error, 'cannot verify range', { url, start, end })
}
const expected = new Uint8Array(expectedRaw)
const stats = {}
if (actual.byteLength !== expected.byteLength) {
stats.sizeDiffers = true
} else if (!expected.every((v, idx) => v === actual[idx])) {
stats.mismatch = true
} else {
stats.success = true
}
trackChunkVerify(metrics, stats)
return expected
}
/**
*
* @param {string} url
* @param {number} start
* @param {number} end
* @param {Object} file
* @param {Object} metrics
* @param {Set} cached
*/
export async function fetchRange({ url, start, end, file, metrics, cached }) {
file.createdAt = new Date(file.createdAt)
backfillEdgeBounds(file)
// Check that handling the range request won't trigger excessive sub-requests,
// (to avoid unwanted latency compared to the original request).
const { chunks, newChunks } = cutRequestAmplification(
getMatchingChunks(file.ranges, start, end),
cached,
metrics
)
const dynamicChunks = getInterleavingDynamicChunks(chunks, start, end)
const chunksSize = countBytes(newChunks)
const size = end - start
if (chunks.length === 0 && dynamicChunks.length === 1) {
// fall back to the original range request when no chunks are cached.
trackDownloadStats(metrics, {
size,
cachedCount: 0,
cachedBytes: 0,
fetchedCount: 1,
fetchedBytes: size,
})
return fallbackRequest({ url, start, end })
}
if (
chunksSize > MAX_SUB_REQUEST_BYTES &&
!(dynamicChunks.length === 0 && newChunks.length <= 1)
) {
// fall back to the original range request when a very large amount of
// object data would be requested, unless it is the only object in the
// request or everything is already cached.
metrics.tooLargeOverheadCount++
trackDownloadStats(metrics, {
size,
cachedCount: 0,
cachedBytes: 0,
fetchedCount: 1,
fetchedBytes: size,
})
return fallbackRequest({ url, start, end })
}
const byteRanges = dynamicChunks
.map(chunk => `${chunk.start}-${chunk.end - 1}`)
.join(',')
const coalescedDynamicChunks = []
switch (dynamicChunks.length) {
case 0:
break
case 1:
coalescedDynamicChunks.push({
chunk: dynamicChunks[0],
url,
init: { headers: { Range: `bytes=${byteRanges}` } },
})
break
default:
coalescedDynamicChunks.push({
chunk: dynamicChunks,
url,
init: { headers: { Range: `bytes=${byteRanges}` } },
})
}
const params = new URL(url).searchParams
// drop params that are not needed for the content requests
params.delete('enable_pdf_caching')
params.delete('verify_chunks')
const query = params.toString()
// The schema of `url` is https://domain/project/:id/user/:id/build/... for
// authenticated and https://domain/project/:id/build/... for
// unauthenticated users. Cut it before /build/.
// The path may have an optional /zone/b prefix too.
const perUserPrefix = url.slice(0, url.indexOf('/build/'))
const requests = chunks
.map(chunk => ({
chunk,
url: `${perUserPrefix}/content/${file.contentId}/${chunk.hash}?${query}`,
}))
.concat(coalescedDynamicChunks)
let cachedCount = 0
let cachedBytes = 0
let fetchedCount = 0
let fetchedBytes = 0
const reassembledBlob = new Uint8Array(size)
const rawResponses = await Promise.all(
requests.map(async ({ chunk, url, init }) => {
try {
const response = await fetch(url, init)
checkChunkResponse(response)
const boundary = getMultipartBoundary(response, chunk)
const blobFetchDate = getServerTime(response)
const blobSize = getResponseSize(response)
if (blobFetchDate && blobSize) {
// Example: 2MB PDF, 1MB image, 128KB PDF.js chunk.
// | pdf.js chunk |
// | A BIG IMAGE BLOB |
// | THE FULL PDF |
if (chunk.hash && blobFetchDate < file.createdAt) {
const usedChunkSection =
Math.min(end, chunk.end) - Math.max(start, chunk.start)
cachedCount++
cachedBytes += usedChunkSection
// Roll the position of the hash in the Map.
cached.delete(chunk.hash)
cached.add(chunk.hash)
} else {
// Blobs are fetched in bulk, record the full size.
fetchedCount++
fetchedBytes += blobSize
}
}
return {
boundary,
chunk,
data: backFillObjectContext(
chunk,
// response.arrayBuffer() yields the first multipart section only.
await (await response.blob()).arrayBuffer()
),
}
} catch (error) {
throw OError.tag(error, 'cannot fetch chunk', { url })
}
})
)
rawResponses
.flatMap(r => resolveMultiPartResponses(r, file, metrics))
.forEach(({ chunk, data }) => {
// overlap:
// | REQUESTED_RANGE |
// | CHUNK |
const offsetStart = Math.max(start - chunk.start, 0)
// overlap:
// | REQUESTED_RANGE |
// | CHUNK |
const offsetEnd = Math.max(chunk.end - end, 0)
if (offsetStart > 0 || offsetEnd > 0) {
// compute index positions for slice to handle case where offsetEnd=0
const chunkSize = chunk.end - chunk.start
data = data.subarray(offsetStart, chunkSize - offsetEnd)
}
const insertPosition = Math.max(chunk.start - start, 0)
reassembledBlob.set(data, insertPosition)
})
trackDownloadStats(metrics, {
size,
cachedCount,
cachedBytes,
fetchedCount,
fetchedBytes,
})
if (url.includes('verify_chunks=true')) {
return await verifyRange({
url,
start,
end,
metrics,
actual: reassembledBlob,
})
}
return reassembledBlob
}
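
fetchRange resolves a PDF.js range request by stitching cached object chunks (served from /content/:contentId/:hash) together with dynamic byte ranges, and falls back to a plain Range request when that would amplify traffic. A minimal call sketch for a browser context (URL, hashes and sizes invented):

import { fetchRange, fallbackRequest } from './pdf-caching'

const url = 'https://compiles.example.com/project/abc/build/1234/output/output.pdf'
const file = {
  size: 300 * 1024,
  contentId: '123-321',
  createdAt: '2022-07-06T11:00:00.000Z',
  ranges: [{ start: 9, end: 64 * 1024, hash: 'deadbeef', objectId: '12 0 obj\n' }],
}
const metrics = {
  tooLargeOverheadCount: 0, tooManyRequestsCount: 0,
  cachedCount: 0, cachedBytes: 0, fetchedCount: 0, fetchedBytes: 0,
  requestedCount: 0, requestedBytes: 0,
}
const cached = new Set()

fetchRange({ url, start: 0, end: 128 * 1024, file, metrics, cached })
  .catch(() => fallbackRequest({ url, start: 0, end: 128 * 1024 }))
  .then(data => console.log('assembled', data.byteLength, 'bytes'))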


@ -1,4 +1,5 @@
import { captureMessage } from '../../../infrastructure/error-reporter'
import { generatePdfCachingTransportFactory } from './pdf-caching-transport'
const params = new URLSearchParams(window.location.search)
const disableFontFace = params.get('disable-font-face') === 'true'
@ -19,6 +20,7 @@ export default class PDFJSWrapper {
})
this.PDFJS = PDFJS
this.genPdfCachingTransport = generatePdfCachingTransportFactory(PDFJS)
this.PDFJSViewer = PDFJSViewer
this.cMapUrl = cMapUrl
this.imageResourcesPath = imageResourcesPath
@ -57,7 +59,7 @@ export default class PDFJSWrapper {
}
// load a document from a URL
loadDocument(url) {
loadDocument(url, pdfFile) {
// cancel any previous loading task
if (this.loadDocumentTask) {
this.loadDocumentTask.destroy()
@ -74,6 +76,7 @@ export default class PDFJSWrapper {
disableAutoFetch: true,
disableStream,
textLayerMode: 2, // PDFJSViewer.TextLayerMode.ENABLE,
range: this.genPdfCachingTransport(url, pdfFile, reject),
})
this.loadDocumentTask.promise


@ -435,10 +435,10 @@ If the project has been renamed please look in your project list for a new proje
}
)
if (getMeta('ol-resetServiceWorker')) {
unregisterServiceWorker()
} else if (getMeta('ol-enablePdfCaching')) {
if (getMeta('ol-pdfCachingMode') === 'service-worker') {
loadServiceWorker()
} else {
unregisterServiceWorker()
}
angular.module('SharelatexApp').config(function ($provide) {
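
So the service worker is only registered for the 'service-worker' mode and is actively unregistered otherwise; the old resetServiceWorker escape hatch is gone. The two helpers are imported from elsewhere in the frontend; a hedged sketch of what they presumably wrap (path and scope are assumptions, not taken from this diff):

// Hedged sketch only; the real loadServiceWorker/unregisterServiceWorker may differ.
function loadServiceWorkerSketch() {
  navigator.serviceWorker.register('/serviceWorker.js', { scope: '/project/' })
}
function unregisterServiceWorkerSketch() {
  navigator.serviceWorker
    .getRegistrations()
    .then(regs => regs.forEach(reg => reg.unregister()))
}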


@ -1,19 +1,17 @@
import { v4 as uuid } from 'uuid'
import { fetchRange } from './features/pdf-preview/util/pdf-caching'
const OError = require('@overleaf/o-error')
// VERSION should get incremented when making changes to caching behavior or
// adjusting metrics collection.
// Keep in sync with PdfJsMetrics.
const VERSION = 2
const VERSION = 3
const CLEAR_CACHE_REQUEST_MATCHER = /^\/project\/[0-9a-f]{24}\/output$/
const COMPILE_REQUEST_MATCHER = /^\/project\/[0-9a-f]{24}\/compile$/
const PDF_REQUEST_MATCHER =
/^(\/zone\/.)?(\/project\/[0-9a-f]{24}\/.*\/output.pdf)$/
const PDF_JS_CHUNK_SIZE = 128 * 1024
const MAX_SUBREQUEST_COUNT = 4
const MAX_SUBREQUEST_BYTES = 4 * PDF_JS_CHUNK_SIZE
const INCREMENTAL_CACHE_SIZE = 1000
// Each compile request defines a context (essentially the specific pdf file for
// that compile), requests for that pdf file can use the hashes in the compile
@ -101,57 +99,6 @@ function expirePdfContexts() {
})
}
/**
*
* @param {Object} metrics
* @param {number} size
* @param {number} cachedCount
* @param {number} cachedBytes
* @param {number} fetchedCount
* @param {number} fetchedBytes
*/
function trackDownloadStats(
metrics,
{ size, cachedCount, cachedBytes, fetchedCount, fetchedBytes }
) {
metrics.cachedCount += cachedCount
metrics.cachedBytes += cachedBytes
metrics.fetchedCount += fetchedCount
metrics.fetchedBytes += fetchedBytes
metrics.requestedCount++
metrics.requestedBytes += size
}
/**
* @param {Object} metrics
* @param {boolean} sizeDiffers
* @param {boolean} mismatch
* @param {boolean} success
*/
function trackChunkVerify(metrics, { sizeDiffers, mismatch, success }) {
if (sizeDiffers) {
metrics.chunkVerifySizeDiffers |= 0
metrics.chunkVerifySizeDiffers += 1
}
if (mismatch) {
metrics.chunkVerifyMismatch |= 0
metrics.chunkVerifyMismatch += 1
}
if (success) {
metrics.chunkVerifySuccess |= 0
metrics.chunkVerifySuccess += 1
}
}
/**
* @param {Array} chunks
*/
function countBytes(chunks) {
return chunks.reduce((totalBytes, chunk) => {
return totalBytes + (chunk.end - chunk.start)
}, 0)
}
/**
* @param {FetchEvent} event
*/
@ -266,7 +213,6 @@ function processPdfRequest(
return event.respondWith(response)
}
const verifyChunks = event.request.url.includes('verify_chunks=true')
const rangeHeader =
event.request.headers.get('Range') || `bytes=0-${file.size - 1}`
const [start, last] = rangeHeader
@ -275,307 +221,35 @@ function processPdfRequest(
.map(i => parseInt(i, 10))
const end = last + 1
// Check that handling the range request won't trigger excessive subrequests,
// (to avoid unwanted latency compared to the original request).
const { chunks, newChunks } = cutRequestAmplification(
getMatchingChunks(file.ranges, start, end),
cached,
metrics
)
const dynamicChunks = getInterleavingDynamicChunks(chunks, start, end)
const chunksSize = countBytes(newChunks)
const size = end - start
if (chunks.length === 0 && dynamicChunks.length === 1) {
// fall back to the original range request when no chunks are cached.
trackDownloadStats(metrics, {
size,
cachedCount: 0,
cachedBytes: 0,
fetchedCount: 1,
fetchedBytes: size,
return event.respondWith(
fetchRange({
url: event.request.url,
start,
end,
file,
pdfCreatedAt,
metrics,
cached,
})
return
}
if (
chunksSize > MAX_SUBREQUEST_BYTES &&
!(dynamicChunks.length === 0 && newChunks.length <= 1)
) {
// fall back to the original range request when a very large amount of
// object data would be requested, unless it is the only object in the
// request or everything is already cached.
metrics.tooLargeOverheadCount++
trackDownloadStats(metrics, {
size,
cachedCount: 0,
cachedBytes: 0,
fetchedCount: 1,
fetchedBytes: size,
})
return
}
// URL prefix is /project/:id/user/:id/build/... or /project/:id/build/...
// for authenticated and unauthenticated users respectively.
const perUserPrefix = file.url.slice(0, file.url.indexOf('/build/'))
const byteRanges = dynamicChunks
.map(chunk => `${chunk.start}-${chunk.end - 1}`)
.join(',')
const coalescedDynamicChunks = []
switch (dynamicChunks.length) {
case 0:
break
case 1:
coalescedDynamicChunks.push({
chunk: dynamicChunks[0],
url: event.request.url,
init: { headers: { Range: `bytes=${byteRanges}` } },
})
break
default:
coalescedDynamicChunks.push({
chunk: dynamicChunks,
url: event.request.url,
init: { headers: { Range: `bytes=${byteRanges}` } },
})
}
const requests = chunks
.map(chunk => {
const path = `${perUserPrefix}/content/${file.contentId}/${chunk.hash}`
const url = new URL(path, event.request.url)
if (clsiServerId) {
url.searchParams.set('clsiserverid', clsiServerId)
}
if (compileGroup) {
url.searchParams.set('compileGroup', compileGroup)
}
return { chunk, url: url.toString() }
})
.concat(coalescedDynamicChunks)
let cachedCount = 0
let cachedBytes = 0
let fetchedCount = 0
let fetchedBytes = 0
const reAssembledBlob = new Uint8Array(size)
event.respondWith(
Promise.all(
requests.map(({ chunk, url, init }) =>
fetch(url, init)
.then(response => {
if (!(response.status === 206 || response.status === 200)) {
throw new OError(
'non successful response status: ' + response.status
)
}
const boundary = getMultipartBoundary(response)
if (Array.isArray(chunk) && !boundary) {
throw new OError('missing boundary on multipart request', {
headers: Object.fromEntries(response.headers.entries()),
chunk,
})
}
const blobFetchDate = getServerTime(response)
const blobSize = getResponseSize(response)
if (blobFetchDate && blobSize) {
const chunkSize =
Math.min(end, chunk.end) - Math.max(start, chunk.start)
// Example: 2MB PDF, 1MB image, 128KB PDF.js chunk.
// | pdf.js chunk |
// | A BIG IMAGE BLOB |
// | THE FULL PDF |
if (blobFetchDate < pdfCreatedAt) {
cachedCount++
cachedBytes += chunkSize
// Roll the position of the hash in the Map.
cached.delete(chunk.hash)
cached.add(chunk.hash)
} else {
// Blobs are fetched in bulk.
fetchedCount++
fetchedBytes += blobSize
}
}
return response
.blob()
.then(blob => blob.arrayBuffer())
.then(arraybuffer => {
return {
boundary,
chunk,
data: backFillObjectContext(chunk, arraybuffer),
}
})
})
.catch(error => {
throw OError.tag(error, 'cannot fetch chunk', { url })
})
)
)
.then(rawResponses => {
const responses = []
for (const response of rawResponses) {
if (response.boundary) {
responses.push(
...getMultiPartResponses(response, file, metrics, verifyChunks)
)
} else {
responses.push(response)
}
}
responses.forEach(({ chunk, data }) => {
// overlap:
// | REQUESTED_RANGE |
// | CHUNK |
const offsetStart = Math.max(start - chunk.start, 0)
// overlap:
// | REQUESTED_RANGE |
// | CHUNK |
const offsetEnd = Math.max(chunk.end - end, 0)
if (offsetStart > 0 || offsetEnd > 0) {
// compute index positions for slice to handle case where offsetEnd=0
const chunkSize = chunk.end - chunk.start
data = data.subarray(offsetStart, chunkSize - offsetEnd)
}
const insertPosition = Math.max(chunk.start - start, 0)
reAssembledBlob.set(data, insertPosition)
})
let verifyProcess = Promise.resolve(reAssembledBlob)
if (verifyChunks) {
verifyProcess = fetch(event.request)
.then(response => response.arrayBuffer())
.then(arrayBuffer => {
const fullBlob = new Uint8Array(arrayBuffer)
const stats = {}
if (reAssembledBlob.byteLength !== fullBlob.byteLength) {
stats.sizeDiffers = true
} else if (
!reAssembledBlob.every((v, idx) => v === fullBlob[idx])
) {
stats.mismatch = true
} else {
stats.success = true
}
trackChunkVerify(metrics, stats)
if (stats.success === true) {
return reAssembledBlob
} else {
return fullBlob
}
})
}
return verifyProcess.then(blob => {
trackDownloadStats(metrics, {
size,
cachedCount,
cachedBytes,
fetchedCount,
fetchedBytes,
})
return new Response(blob, {
status: 206,
headers: {
'Accept-Ranges': 'bytes',
'Content-Length': size,
'Content-Range': `bytes ${start}-${last}/${file.size}`,
'Content-Type': 'application/pdf',
},
})
.then(blob => {
return new Response(blob, {
status: 206,
headers: {
'Accept-Ranges': 'bytes',
'Content-Length': end - start,
'Content-Range': `bytes ${start}-${last}/${file.size}`,
'Content-Type': 'application/pdf',
},
})
})
.catch(error => {
fetchedBytes += size
metrics.failedCount++
trackDownloadStats(metrics, {
size,
cachedCount: 0,
cachedBytes: 0,
fetchedCount,
fetchedBytes,
})
reportError(event, OError.tag(error, 'failed to compose pdf response'))
return fetch(event.request)
})
)
}
/**
*
* @param {Response} response
*/
function getServerTime(response) {
const raw = response.headers.get('Date')
if (!raw) return new Date()
return new Date(raw)
}
/**
*
* @param {Response} response
*/
function getResponseSize(response) {
const raw = response.headers.get('Content-Length')
if (!raw) return 0
return parseInt(raw, 10)
}
/**
*
* @param {Response} response
*/
function getMultipartBoundary(response) {
const raw = response.headers.get('Content-Type')
if (!raw.includes('multipart/byteranges')) return ''
const idx = raw.indexOf('boundary=')
if (idx === -1) return ''
return raw.slice(idx + 'boundary='.length)
}
/**
* @param {Object} response
* @param {Object} file
* @param {Object} metrics
* @param {boolean} verifyChunks
*/
function getMultiPartResponses(response, file, metrics, verifyChunks) {
const { chunk: chunks, data, boundary } = response
const responses = []
let offsetStart = 0
for (const chunk of chunks) {
const header = `\r\n--${boundary}\r\nContent-Type: application/pdf\r\nContent-Range: bytes ${
chunk.start
}-${chunk.end - 1}/${file.size}\r\n\r\n`
const headerSize = header.length
// Verify header content. A proxy might have tampered with it.
const headerRaw = ENCODER.encode(header)
if (
!data
.subarray(offsetStart, offsetStart + headerSize)
.every((v, idx) => v === headerRaw[idx])
) {
metrics.headerVerifyFailure |= 0
metrics.headerVerifyFailure++
throw new OError('multipart response header does not match', {
actual: new TextDecoder().decode(
data.subarray(offsetStart, offsetStart + headerSize)
),
expected: header,
})
}
offsetStart += headerSize
const chunkSize = chunk.end - chunk.start
responses.push({
chunk,
data: data.subarray(offsetStart, offsetStart + chunkSize),
})
offsetStart += chunkSize
}
return responses
}
/**
* @param {FetchEvent} event
* @param {Response} response
@ -584,15 +258,11 @@ function getMultiPartResponses(response, file, metrics, verifyChunks) {
function handleCompileResponse(event, response, body) {
if (!body || body.status !== 'success') return
const pdfCreatedAt = getServerTime(response)
for (const file of body.outputFiles) {
if (file.path !== 'output.pdf') continue // not the pdf used for rendering
if (file.ranges) {
file.ranges.forEach(backFillEdgeBounds)
if (file.ranges?.length) {
const { clsiServerId, compileGroup } = body
registerPdfContext(event.clientId, file.url, {
pdfCreatedAt,
file,
clsiServerId,
compileGroup,
@ -602,117 +272,6 @@ function handleCompileResponse(event, response, body) {
}
}
const ENCODER = new TextEncoder()
function backFillEdgeBounds(chunk) {
if (chunk.objectId) {
chunk.objectId = ENCODER.encode(chunk.objectId)
chunk.start -= chunk.objectId.byteLength
}
return chunk
}
/**
* @param chunk
* @param {ArrayBuffer} arrayBuffer
* @return {Uint8Array}
*/
function backFillObjectContext(chunk, arrayBuffer) {
if (!chunk.objectId) {
// This is a dynamic chunk
return new Uint8Array(arrayBuffer)
}
const { start, end, objectId } = chunk
const header = Uint8Array.from(objectId)
const fullBuffer = new Uint8Array(end - start)
fullBuffer.set(header, 0)
fullBuffer.set(new Uint8Array(arrayBuffer), objectId.length)
return fullBuffer
}
/**
* @param {Array} chunks
* @param {number} start
* @param {number} end
* @returns {Array}
*/
function getMatchingChunks(chunks, start, end) {
const matchingChunks = []
for (const chunk of chunks) {
if (chunk.end <= start) {
// no overlap:
// | REQUESTED_RANGE |
// | CHUNK |
continue
}
if (chunk.start >= end) {
// no overlap:
// | REQUESTED_RANGE |
// | CHUNK |
break
}
matchingChunks.push(chunk)
}
return matchingChunks
}
/**
* @param {Array} potentialChunks
* @param {Set} cached
* @param {Object} metrics
*/
function cutRequestAmplification(potentialChunks, cached, metrics) {
const chunks = []
const newChunks = []
let tooManyRequests = false
for (const chunk of potentialChunks) {
if (cached.has(chunk.hash)) {
chunks.push(chunk)
continue
}
if (newChunks.length < MAX_SUBREQUEST_COUNT) {
chunks.push(chunk)
newChunks.push(chunk)
} else {
tooManyRequests = true
}
}
if (tooManyRequests) {
metrics.tooManyRequestsCount++
}
if (cached.size > INCREMENTAL_CACHE_SIZE) {
for (const key of cached) {
if (cached.size < INCREMENTAL_CACHE_SIZE) {
break
}
// Map keys are stored in insertion order.
// We re-insert keys on cache hit, 'cached' is a cheap LRU.
cached.delete(key)
}
}
return { chunks, newChunks }
}
/**
* @param {Array} chunks
* @param {number} start
* @param {number} end
* @returns {Array}
*/
function getInterleavingDynamicChunks(chunks, start, end) {
const dynamicChunks = []
for (const chunk of chunks) {
if (start < chunk.start) {
dynamicChunks.push({ start, end: chunk.start })
}
start = chunk.end
}
if (start < end) {
dynamicChunks.push({ start, end })
}
return dynamicChunks
}
/**
* @param {FetchEvent} event
*/


@ -36,7 +36,7 @@ export function DetachCompileProvider({ children }) {
logEntries: _logEntries,
logEntryAnnotations: _logEntryAnnotations,
pdfDownloadUrl: _pdfDownloadUrl,
pdfSize: _pdfSize,
pdfFile: _pdfFile,
pdfUrl: _pdfUrl,
pdfViewer: _pdfViewer,
position: _position,
@ -155,9 +155,9 @@ export function DetachCompileProvider({ children }) {
'detacher',
'detached'
)
const [pdfSize] = useDetachStateWatcher(
'pdfSize',
_pdfSize,
const [pdfFile] = useDetachStateWatcher(
'pdfFile',
_pdfFile,
'detacher',
'detached'
)
@ -362,7 +362,7 @@ export function DetachCompileProvider({ children }) {
logEntryAnnotations,
logEntries,
pdfDownloadUrl,
pdfSize,
pdfFile,
pdfUrl,
pdfViewer,
position,
@ -411,7 +411,7 @@ export function DetachCompileProvider({ children }) {
logEntryAnnotations,
logEntries,
pdfDownloadUrl,
pdfSize,
pdfFile,
pdfUrl,
pdfViewer,
position,


@ -46,7 +46,7 @@ export const CompileContextPropTypes = {
logEntries: PropTypes.object,
logEntryAnnotations: PropTypes.object,
pdfDownloadUrl: PropTypes.string,
pdfSize: PropTypes.number,
pdfFile: PropTypes.object,
pdfUrl: PropTypes.string,
pdfViewer: PropTypes.string,
position: PropTypes.object,
@ -103,7 +103,8 @@ export function LocalCompileProvider({ children }) {
// the URL for loading the PDF in the preview pane
const [pdfUrl, setPdfUrl] = useScopeValueSetterOnly('pdf.url')
const [pdfSize, setPdfSize] = useState(0)
// low level details for metrics
const [pdfFile, setPdfFile] = useState()
// the project is considered to be "uncompiled" if a doc has changed since the last compile started
const [uncompiled, setUncompiled] = useScopeValue('pdf.uncompiled')
@ -288,7 +289,7 @@ export function LocalCompileProvider({ children }) {
const result = handleOutputFiles(outputFiles, projectId, data)
if (data.status === 'success') {
setPdfDownloadUrl(result.pdfDownloadUrl)
setPdfSize(result.pdfSize)
setPdfFile(result.pdfFile)
setPdfUrl(result.pdfUrl)
}
@ -406,7 +407,7 @@ export function LocalCompileProvider({ children }) {
setLogEntries,
setLogEntryAnnotations,
setPdfDownloadUrl,
setPdfSize,
setPdfFile,
setPdfUrl,
])
@ -507,7 +508,7 @@ export function LocalCompileProvider({ children }) {
logEntryAnnotations,
logEntries,
pdfDownloadUrl,
pdfSize,
pdfFile,
pdfUrl,
pdfViewer,
position,
@ -557,7 +558,7 @@ export function LocalCompileProvider({ children }) {
logEntryAnnotations,
position,
pdfDownloadUrl,
pdfSize,
pdfFile,
pdfUrl,
pdfViewer,
rawLog,


@ -2,6 +2,7 @@ const sinon = require('sinon')
const { expect } = require('chai')
const modulePath = '../../../../app/src/Features/Compile/ClsiManager.js'
const SandboxedModule = require('sandboxed-module')
const tk = require('timekeeper')
describe('ClsiManager', function () {
beforeEach(function () {
@ -72,6 +73,11 @@ describe('ClsiManager', function () {
this.project_id = 'project-id'
this.user_id = 'user-id'
this.callback = sinon.stub()
tk.freeze(Date.now())
})
after(function () {
tk.reset()
})
describe('sendRequest', function () {
@ -136,9 +142,10 @@ describe('ClsiManager', function () {
path: 'output.pdf',
type: 'pdf',
build: 1234,
ranges: [],
createdAt: new Date(),
// gets dropped by JSON.stringify
contentId: undefined,
ranges: undefined,
size: undefined,
},
{
@ -146,10 +153,6 @@ describe('ClsiManager', function () {
path: 'output.log',
type: 'log',
build: 1234,
// gets dropped by JSON.stringify
contentId: undefined,
ranges: undefined,
size: undefined,
},
]
this.callback
@ -207,16 +210,13 @@ describe('ClsiManager', function () {
contentId: '123-321',
ranges: [{ start: 1, end: 42, hash: 'foo' }],
size: 42,
createdAt: new Date(),
},
{
url: `/project/${this.project_id}/user/${this.user_id}/build/1234/output/output.log`,
path: 'output.log',
type: 'log',
build: 1234,
// gets dropped by JSON.stringify
contentId: undefined,
ranges: undefined,
size: undefined,
},
]
const validationError = undefined
@ -417,9 +417,10 @@ describe('ClsiManager', function () {
path: 'output.pdf',
type: 'pdf',
build: 1234,
ranges: [],
createdAt: new Date(),
// gets dropped by JSON.stringify
contentId: undefined,
ranges: undefined,
size: undefined,
},
{
@ -427,15 +428,13 @@ describe('ClsiManager', function () {
path: 'output.log',
type: 'log',
build: 1234,
// gets dropped by JSON.stringify
contentId: undefined,
ranges: undefined,
size: undefined,
},
]
this.callback
.calledWith(null, this.status, outputFiles)
.should.equal(true)
this.callback.should.have.been.calledWith(
null,
this.status,
outputFiles
)
})
})
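
The new timekeeper setup exists because _parseOutputFiles now stamps output.pdf with createdAt: new Date(); without a frozen clock the deep-equality assertion on outputFiles could never match. A small sketch of the effect (timekeeper is the npm package already required above):

const tk = require('timekeeper')
tk.freeze(Date.now())
const a = new Date()
const b = new Date() // same instant while frozen, so deep-equal expectations hold
console.log(a.getTime() === b.getTime()) // true
tk.reset()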


@ -1136,10 +1136,10 @@ describe('ProjectController', function () {
this.ProjectController.loadEditor(this.req, this.res)
})
}
function expectPDFCachingEnabled() {
function expectPDFCachingEnabled(mode) {
it('should enable pdf caching', function (done) {
this.res.render = (pageName, opts) => {
expect(opts.enablePdfCaching).to.equal(true)
expect(opts.pdfCachingMode).to.equal(mode)
done()
}
this.ProjectController.loadEditor(this.req, this.res)
@ -1157,7 +1157,7 @@ describe('ProjectController', function () {
function expectPDFCachingDisabled() {
it('should disable pdf caching', function (done) {
this.res.render = (pageName, opts) => {
expect(opts.enablePdfCaching).to.equal(false)
expect(opts.pdfCachingMode).to.equal('')
done()
}
this.ProjectController.loadEditor(this.req, this.res)
@ -1174,20 +1174,32 @@ describe('ProjectController', function () {
expectPDFCachingDisabled()
})
describe('with enable_pdf_caching=false', function () {
describe('with track-pdf-download=false', function () {
beforeEach(function () {
this.req.query.enable_pdf_caching = 'false'
this.req.query['track-pdf-download'] = 'false'
})
expectBandwidthTrackingDisabled()
expectPDFCachingDisabled()
})
describe('with enable_pdf_caching=true', function () {
describe('with track-pdf-download=true', function () {
beforeEach(function () {
this.req.query.enable_pdf_caching = 'true'
this.req.query['track-pdf-download'] = 'true'
})
expectBandwidthTrackingEnabled()
expectPDFCachingEnabled()
})
describe('with pdf-caching-mode=no-service-worker', function () {
beforeEach(function () {
this.req.query['pdf-caching-mode'] = 'no-service-worker'
})
expectPDFCachingEnabled('no-service-worker')
})
describe('with pdf-caching-mode=service-worker', function () {
beforeEach(function () {
this.req.query['pdf-caching-mode'] = 'service-worker'
})
expectPDFCachingEnabled('service-worker')
})
})
})