overleaf/services/web/frontend/js/features/pdf-preview/util/pdf-caching.js

491 lines
13 KiB
JavaScript
Raw Normal View History

import OError from '@overleaf/o-error'
// Size in bytes of one PDF.js chunk (128 KiB). In this file it only serves as
// the unit for MAX_SUB_REQUEST_BYTES.
// NOTE(review): presumably mirrors the range size PDF.js requests -- confirm.
const PDF_JS_CHUNK_SIZE = 128 * 1024
// Upper bound on the number of not-yet-cached chunks that
// cutRequestAmplification() admits for a single range request.
const MAX_SUB_REQUEST_COUNT = 4
// fetchRange() falls back to a plain range request when the not-yet-cached
// chunks add up to more than this many bytes.
const MAX_SUB_REQUEST_BYTES = 4 * PDF_JS_CHUNK_SIZE
// Once the `cached` set of hashes grows beyond this many entries,
// cutRequestAmplification() evicts the oldest ones.
const INCREMENTAL_CACHE_SIZE = 1000
// Shared encoder used to turn object ids and expected multipart headers into
// bytes.
const ENCODER = new TextEncoder()
/**
 * Convert the object ids of `file.ranges` into bytes and widen each affected
 * range so that it starts at the object id.
 *
 * The conversion runs at most once per `file` object; the
 * `backfilledEdgeBoundsOnce` flag on the file records that it happened.
 *
 * @param {Object} file
 */
function backfillEdgeBounds(file) {
  const alreadyBackfilled = Boolean(file.backfilledEdgeBoundsOnce)
  if (!alreadyBackfilled) {
    for (const range of file.ranges) {
      if (!range.objectId) {
        // Nothing to prepend for ranges without an object id.
        continue
      }
      const encodedId = ENCODER.encode(range.objectId)
      range.objectId = encodedId
      // Move the start back by the size of the encoded object id.
      range.start -= encodedId.byteLength
    }
  }
  file.backfilledEdgeBoundsOnce = true
}
/**
 * Sum up the sizes (`end - start`) of all given chunks.
 *
 * @param {Array} chunks
 * @returns {number} total number of bytes, 0 for an empty list
 */
function countBytes(chunks) {
  let totalBytes = 0
  for (const { start, end } of chunks) {
    totalBytes += end - start
  }
  return totalBytes
}
/**
 * Add the statistics of one handled range request to the running metrics.
 *
 * @param {Object} metrics counters that are updated in place
 * @param {Object} stats
 * @param {number} stats.size size of the requested range
 * @param {number} stats.cachedCount
 * @param {number} stats.cachedBytes
 * @param {number} stats.fetchedCount
 * @param {number} stats.fetchedBytes
 */
function trackDownloadStats(
  metrics,
  { size, cachedCount, cachedBytes, fetchedCount, fetchedBytes }
) {
  // Every call accounts for exactly one requested range.
  metrics.requestedCount++
  const increments = {
    requestedBytes: size,
    cachedCount,
    cachedBytes,
    fetchedCount,
    fetchedBytes,
  }
  for (const [counter, amount] of Object.entries(increments)) {
    metrics[counter] += amount
  }
}
/**
 * Count the outcome of a chunk verification. Counters that do not exist yet
 * are started at zero before they are incremented.
 *
 * @param {Object} metrics counters that are updated in place
 * @param {Object} outcome
 * @param {boolean} outcome.sizeDiffers
 * @param {boolean} outcome.mismatch
 * @param {boolean} outcome.success
 */
function trackChunkVerify(metrics, { sizeDiffers, mismatch, success }) {
  const outcomes = [
    [sizeDiffers, 'chunkVerifySizeDiffers'],
    [mismatch, 'chunkVerifyMismatch'],
    [success, 'chunkVerifySuccess'],
  ]
  for (const [happened, counter] of outcomes) {
    if (happened) {
      // `| 0` turns a missing counter into 0.
      metrics[counter] = (metrics[counter] | 0) + 1
    }
  }
}
/**
 * Turn a downloaded body into the bytes of the full chunk.
 *
 * Chunks that carry an `objectId` get that id prepended to the downloaded
 * body; chunks without one (dynamic chunks) are returned as they are.
 *
 * @param chunk
 * @param {ArrayBuffer} arrayBuffer downloaded body
 * @return {Uint8Array}
 */
function backFillObjectContext(chunk, arrayBuffer) {
  const { objectId } = chunk
  if (objectId) {
    const prefix = Uint8Array.from(objectId)
    // The chunk bounds span the object id plus the body.
    const combined = new Uint8Array(chunk.end - chunk.start)
    combined.set(prefix, 0)
    combined.set(new Uint8Array(arrayBuffer), objectId.length)
    return combined
  }
  // This is a dynamic chunk
  return new Uint8Array(arrayBuffer)
}
/**
 * Collect the chunks that overlap the requested range `[start, end)`.
 *
 * Scanning stops at the first chunk that starts at or after `end`, so later
 * chunks are never looked at.
 *
 * @param {Array} chunks
 * @param {number} start
 * @param {number} end
 * @returns {Array}
 */
function getMatchingChunks(chunks, start, end) {
  const overlapping = []
  for (const candidate of chunks) {
    const endsBeforeRange = candidate.end <= start
    if (!endsBeforeRange) {
      if (candidate.start >= end) {
        // no overlap, the chunk starts behind the requested range:
        // | REQUESTED_RANGE |
        //                     | CHUNK |
        break
      }
      overlapping.push(candidate)
    }
    // otherwise no overlap, the chunk ends before the requested range:
    //           | REQUESTED_RANGE |
    // | CHUNK |
  }
  return overlapping
}
/**
 * Limit the number of chunks that need a new download and trim the set of
 * known hashes.
 *
 * Chunks whose hash is in `cached` are always kept. Of the remaining chunks
 * only the first MAX_SUB_REQUEST_COUNT are kept; when any are dropped,
 * `metrics.tooManyRequestsCount` is incremented once.
 *
 * @param {Array} potentialChunks
 * @param {Set} cached hashes of chunks seen before, oldest first
 * @param {Object} metrics
 * @returns {{chunks: Array, newChunks: Array}} all kept chunks, and the subset
 *   of them that is not in `cached`
 */
function cutRequestAmplification(potentialChunks, cached, metrics) {
  const accepted = []
  const uncached = []
  let droppedSome = false
  for (const candidate of potentialChunks) {
    const alreadyCached = cached.has(candidate.hash)
    if (!alreadyCached && uncached.length >= MAX_SUB_REQUEST_COUNT) {
      droppedSome = true
      continue
    }
    accepted.push(candidate)
    if (!alreadyCached) {
      uncached.push(candidate)
    }
  }
  if (droppedSome) {
    metrics.tooManyRequestsCount++
  }
  if (cached.size > INCREMENTAL_CACHE_SIZE) {
    // A Set iterates in insertion order and hashes get re-inserted on cache
    // hit, so the entries seen first here are the least recently used ones.
    for (const oldest of cached) {
      if (cached.size >= INCREMENTAL_CACHE_SIZE) {
        cached.delete(oldest)
      } else {
        break
      }
    }
  }
  return { chunks: accepted, newChunks: uncached }
}
/**
 * Compute the parts of the requested range `[start, end)` that are not
 * covered by the given chunks, i.e. the gaps before, between and after them.
 *
 * @param {Array} chunks
 * @param {number} start
 * @param {number} end
 * @returns {Array} list of `{ start, end }` gaps
 */
function getInterleavingDynamicChunks(chunks, start, end) {
  const gaps = []
  // Position up to which the requested range is covered so far.
  let cursor = start
  for (const { start: chunkStart, end: chunkEnd } of chunks) {
    if (cursor < chunkStart) {
      gaps.push({ start: cursor, end: chunkStart })
    }
    cursor = chunkEnd
  }
  if (cursor < end) {
    gaps.push({ start: cursor, end })
  }
  return gaps
}
/**
 * Read the `Date` header of a response.
 *
 * @param {Response} response
 * @returns {Date} the parsed header, or the current time when it is missing
 */
function getServerTime(response) {
  const dateHeader = response.headers.get('Date')
  return dateHeader ? new Date(dateHeader) : new Date()
}
/**
 * Read the `Content-Length` header of a response as a base-10 integer.
 *
 * @param {Response} response
 * @returns {number} the parsed size, or 0 when the header is missing
 */
function getResponseSize(response) {
  const lengthHeader = response.headers.get('Content-Length')
  return lengthHeader ? parseInt(lengthHeader, 10) : 0
}
/**
 * Extract the multipart boundary from the `Content-Type` header of a response.
 *
 * Only requests for a list of chunks are expected to be answered with a
 * `multipart/byteranges` body; for a single chunk an empty boundary is
 * returned without looking at the response.
 *
 * @param {Response} response
 * @param chunk a single chunk, or an Array of chunks for a multi-range request
 * @returns {string} the boundary, or '' when `chunk` is not an Array
 * @throws {OError} when a multi-range request was not answered with a
 *   `multipart/byteranges` Content-Type that names a boundary. This includes
 *   responses that carry no Content-Type header at all.
 */
function getMultipartBoundary(response, chunk) {
  if (!Array.isArray(chunk)) return ''
  // Headers.get() yields null for a missing header. Guard against it so that
  // the descriptive error below is raised instead of a bare TypeError.
  const raw = response.headers.get('Content-Type')
  if (typeof raw === 'string' && raw.includes('multipart/byteranges')) {
    const idx = raw.indexOf('boundary=')
    if (idx !== -1) return raw.slice(idx + 'boundary='.length)
  }
  throw new OError('missing boundary on multipart request', {
    headers: Object.fromEntries(response.headers.entries()),
    chunk,
  })
}
/**
 * Split the body of a `multipart/byteranges` response into one entry per
 * requested chunk.
 *
 * Responses without a boundary are passed through unchanged as a single
 * entry. For multipart responses each part must start with exactly the
 * header that is expected for the respective chunk and must carry the full
 * chunk body; truncated or altered responses are rejected.
 *
 * @param {Object} response `{ chunk, data, boundary }`; for multipart
 *   responses `chunk` is the Array of requested chunks and `data` is the
 *   Uint8Array of the whole body
 * @param {Object} file `file.size` is part of the expected Content-Range
 * @param {Object} metrics `headerVerifyFailure` is incremented on header
 *   mismatch
 * @returns {Array} list of `{ chunk, data }`
 * @throws {OError} when a part header does not match or the body is too short
 */
function resolveMultiPartResponses(response, file, metrics) {
  const { chunk: chunks, data, boundary } = response
  if (!boundary) {
    return [response]
  }
  const responses = []
  let offsetStart = 0
  for (const chunk of chunks) {
    const header = `\r\n--${boundary}\r\nContent-Type: application/pdf\r\nContent-Range: bytes ${
      chunk.start
    }-${chunk.end - 1}/${file.size}\r\n\r\n`
    const headerRaw = ENCODER.encode(header)
    // Measure the header in bytes, that is the unit `data` is indexed in.
    const headerSize = headerRaw.length
    // Verify header content. A proxy might have tampered with it.
    // subarray() clamps to the available data, so compare the length as well:
    // every() alone would accept a truncated (even empty) header.
    const headerActual = data.subarray(offsetStart, offsetStart + headerSize)
    if (
      headerActual.length !== headerSize ||
      !headerActual.every((v, idx) => v === headerRaw[idx])
    ) {
      metrics.headerVerifyFailure |= 0
      metrics.headerVerifyFailure++
      throw new OError('multipart response header does not match', {
        actual: new TextDecoder().decode(headerActual),
        expected: header,
      })
    }
    offsetStart += headerSize
    const chunkSize = chunk.end - chunk.start
    const chunkData = data.subarray(offsetStart, offsetStart + chunkSize)
    if (chunkData.length !== chunkSize) {
      // A short part would silently leave zero bytes in the final result.
      throw new OError('multipart response body is truncated', {
        expected: chunkSize,
        actual: chunkData.length,
        chunk,
      })
    }
    responses.push({
      chunk,
      data: chunkData,
    })
    offsetStart += chunkSize
  }
  return responses
}
/**
 * Ensure that a response carries one of the accepted status codes (200 or
 * 206).
 *
 * @param {Response} response
 * @throws {OError} for any other status code
 */
function checkChunkResponse(response) {
  const { status } = response
  const isSuccessful = status === 200 || status === 206
  if (isSuccessful) return
  throw new OError(`non successful response status: ${status}`)
}
/**
 * Download the bytes `[start, end)` of `url` with a single plain range
 * request.
 *
 * @param {Object} options
 * @param {string} options.url
 * @param {number} options.start first byte, inclusive
 * @param {number} options.end last byte, exclusive
 * @returns {Promise<ArrayBuffer>} the response body; rejects when the
 *   response status is not accepted by checkChunkResponse()
 */
export function fallbackRequest({ url, start, end }) {
  // The Range header takes an inclusive last byte position.
  const lastByte = end - 1
  const init = { headers: { Range: `bytes=${start}-${lastByte}` } }
  const readBodyWhenOk = response => {
    checkChunkResponse(response)
    return response.arrayBuffer()
  }
  return fetch(url, init).then(readBodyWhenOk)
}
/**
 * Compare an assembled range against a fresh download of the same range via
 * fallbackRequest() and record the outcome in `metrics`.
 *
 * @param {Object} options
 * @param {string} options.url
 * @param {number} options.start
 * @param {number} options.end
 * @param {Object} options.metrics
 * @param {Uint8Array} options.actual the assembled bytes under test
 * @returns {Promise<Uint8Array>} the freshly downloaded bytes, regardless of
 *   the outcome of the comparison
 */
async function verifyRange({ url, start, end, metrics, actual }) {
  let freshBuffer
  try {
    freshBuffer = await fallbackRequest({ url, start, end })
  } catch (error) {
    throw OError.tag(error, 'cannot verify range', { url, start, end })
  }
  const expected = new Uint8Array(freshBuffer)

  let outcome
  if (actual.byteLength !== expected.byteLength) {
    outcome = { sizeDiffers: true }
  } else {
    const identical = expected.every((byte, idx) => byte === actual[idx])
    outcome = identical ? { success: true } : { mismatch: true }
  }
  trackChunkVerify(metrics, outcome)
  return expected
}
/**
 * Serve the byte range `[start, end)` of the PDF at `url`, assembling it from
 * per-object downloads (addressed by hash) plus range requests for the gaps
 * between those objects.
 *
 * The plain range request (fallbackRequest) is used instead when no known
 * object overlaps the requested range, or when the objects that would need a
 * new download exceed MAX_SUB_REQUEST_BYTES.
 *
 * Side effects: `file.createdAt` is replaced by a Date, `file.ranges` is
 * backfilled once (see backfillEdgeBounds), `metrics` counters are updated
 * and hashes are added to / evicted from `cached`.
 *
 * @param {Object} options
 * @param {string} options.url absolute URL of the PDF (it is parsed with
 *   `new URL`) that contains a `/build/` path segment
 * @param {number} options.start first requested byte, inclusive
 * @param {number} options.end last requested byte, exclusive
 * @param {Object} options.file reads `ranges`, `createdAt`, `contentId` and
 *   `size`
 * @param {Object} options.metrics
 * @param {Set} options.cached hashes of object chunks considered cached
 * @returns {Promise<ArrayBuffer|Uint8Array>} an ArrayBuffer on the fallback
 *   paths, a Uint8Array otherwise
 */
export async function fetchRange({ url, start, end, file, metrics, cached }) {
  // Normalize to a Date so that it can be compared with the server time below.
  file.createdAt = new Date(file.createdAt)
  backfillEdgeBounds(file)
  // Check that handling the range request won't trigger excessive sub-requests,
  // (to avoid unwanted latency compared to the original request).
  const { chunks, newChunks } = cutRequestAmplification(
    getMatchingChunks(file.ranges, start, end),
    cached,
    metrics
  )
  // Parts of the requested range that are not covered by any selected chunk.
  const dynamicChunks = getInterleavingDynamicChunks(chunks, start, end)
  const chunksSize = countBytes(newChunks)
  const size = end - start
  if (chunks.length === 0 && dynamicChunks.length === 1) {
    // fall back to the original range request when no object chunk was
    // selected for the requested range.
    trackDownloadStats(metrics, {
      size,
      cachedCount: 0,
      cachedBytes: 0,
      fetchedCount: 1,
      fetchedBytes: size,
    })
    return fallbackRequest({ url, start, end })
  }
  if (
    chunksSize > MAX_SUB_REQUEST_BYTES &&
    !(dynamicChunks.length === 0 && newChunks.length <= 1)
  ) {
    // fall back to the original range request when a very large amount of
    // object data would be requested, unless it is the only object in the
    // request or everything is already cached.
    metrics.tooLargeOverheadCount++
    trackDownloadStats(metrics, {
      size,
      cachedCount: 0,
      cachedBytes: 0,
      fetchedCount: 1,
      fetchedBytes: size,
    })
    return fallbackRequest({ url, start, end })
  }
  // All gaps are fetched with one request; Range takes inclusive end positions.
  const byteRanges = dynamicChunks
    .map(chunk => `${chunk.start}-${chunk.end - 1}`)
    .join(',')
  const coalescedDynamicChunks = []
  switch (dynamicChunks.length) {
    case 0:
      break
    case 1:
      // A single gap: `chunk` is a plain chunk, no multipart handling follows.
      coalescedDynamicChunks.push({
        chunk: dynamicChunks[0],
        url,
        init: { headers: { Range: `bytes=${byteRanges}` } },
      })
      break
    default:
      // Several gaps: `chunk` is an Array, which makes getMultipartBoundary()
      // and resolveMultiPartResponses() treat the response as multipart.
      coalescedDynamicChunks.push({
        chunk: dynamicChunks,
        url,
        init: { headers: { Range: `bytes=${byteRanges}` } },
      })
  }
  const params = new URL(url).searchParams
  // drop params that are not needed
  params.delete('enable_pdf_caching')
  params.delete('verify_chunks')
  const query = params.toString()
  // The schema of `url` is https://domain/project/:id/user/:id/build/... for
  // authenticated and https://domain/project/:id/build/... for
  // unauthenticated users. Cut it before /build/.
  // The path may have an optional /zone/b prefix too.
  const perUserPrefix = url.slice(0, url.indexOf('/build/'))
  // One request per selected object chunk, plus at most one for all gaps.
  const requests = chunks
    .map(chunk => ({
      chunk,
      url: `${perUserPrefix}/content/${file.contentId}/${chunk.hash}?${query}`,
    }))
    .concat(coalescedDynamicChunks)
  // These counters are updated from the concurrently running requests below.
  let cachedCount = 0
  let cachedBytes = 0
  let fetchedCount = 0
  let fetchedBytes = 0
  const reassembledBlob = new Uint8Array(size)
  const rawResponses = await Promise.all(
    requests.map(async ({ chunk, url, init }) => {
      try {
        const response = await fetch(url, init)
        checkChunkResponse(response)
        const boundary = getMultipartBoundary(response, chunk)
        const blobFetchDate = getServerTime(response)
        const blobSize = getResponseSize(response)
        if (blobFetchDate && blobSize) {
          // Example: 2MB PDF, 1MB image, 128KB PDF.js chunk.
          //     | pdf.js chunk |
          //   | A BIG IMAGE BLOB |
          // |     THE FULL PDF     |
          // A hashed chunk whose response is dated before the file was created
          // is counted as cached.
          // NOTE(review): presumably such a response came out of a cache
          // rather than from the server -- confirm.
          if (chunk.hash && blobFetchDate < file.createdAt) {
            // Count only the part of the chunk that overlaps the request.
            const usedChunkSection =
              Math.min(end, chunk.end) - Math.max(start, chunk.start)
            cachedCount++
            cachedBytes += usedChunkSection
            // Roll the position of the hash in the Set.
            cached.delete(chunk.hash)
            cached.add(chunk.hash)
          } else {
            // Blobs are fetched in bulk, record the full size.
            fetchedCount++
            fetchedBytes += blobSize
          }
        }
        return {
          boundary,
          chunk,
          data: backFillObjectContext(
            chunk,
            // response.arrayBuffer() yields the first multipart section only.
            await (await response.blob()).arrayBuffer()
          ),
        }
      } catch (error) {
        throw OError.tag(error, 'cannot fetch chunk', { url })
      }
    })
  )
  rawResponses
    .flatMap(r => resolveMultiPartResponses(r, file, metrics))
    .forEach(({ chunk, data }) => {
      // overlap:
      //     | REQUESTED_RANGE |
      //  | CHUNK |
      const offsetStart = Math.max(start - chunk.start, 0)
      // overlap:
      // | REQUESTED_RANGE |
      //               | CHUNK |
      const offsetEnd = Math.max(chunk.end - end, 0)
      if (offsetStart > 0 || offsetEnd > 0) {
        // compute index positions for slice to handle case where offsetEnd=0
        const chunkSize = chunk.end - chunk.start
        data = data.subarray(offsetStart, chunkSize - offsetEnd)
      }
      // Position of the (trimmed) chunk inside the requested range.
      const insertPosition = Math.max(chunk.start - start, 0)
      reassembledBlob.set(data, insertPosition)
    })
  trackDownloadStats(metrics, {
    size,
    cachedCount,
    cachedBytes,
    fetchedCount,
    fetchedBytes,
  })
  if (url.includes('verify_chunks=true')) {
    // verifyRange() resolves with a fresh download of the range, not with
    // reassembledBlob.
    return await verifyRange({
      url,
      start,
      end,
      metrics,
      actual: reassembledBlob,
    })
  }
  return reassembledBlob
}