Merge pull request #10274 from overleaf/jpa-pdf-caching-check-response-size

[web] pdf-caching: bailout early on unexpectedly large response size

GitOrigin-RevId: b2b1cf0abf2367ec7a7f0f5ca7a9f67a03e2fa0d
Jakob Ackermann, 2022-11-03 10:42:50 +00:00 (committed by Copybot)
parent e7203ed0f8
commit 986d52b8d4


@@ -3,7 +3,14 @@ import OError from '@overleaf/o-error'
 const PDF_JS_CHUNK_SIZE = 128 * 1024
 const MAX_SUB_REQUEST_COUNT = 4
 const MAX_SUB_REQUEST_BYTES = 4 * PDF_JS_CHUNK_SIZE
-const HEADER_OVERHEAD_PER_MULTI_PART_CHUNK = 100
+const SAMPLE_NGINX_BOUNDARY = '00000000000000000001'
+const HEADER_OVERHEAD_PER_MULTI_PART_CHUNK = composeMultipartHeader({
+  boundary: SAMPLE_NGINX_BOUNDARY,
+  // Assume an upper bound of O(9GB) for the pdf size.
+  start: 9 * 1024 * 1024 * 1024,
+  end: 9 * 1024 * 1024 * 1024,
+  size: 9 * 1024 * 1024 * 1024,
+}).length
 const MULTI_PART_THRESHOLD = 4
 const INCREMENTAL_CACHE_SIZE = 1000
 // Download large chunks once the shard bandwidth exceeds 50% of their size.
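For reference, the computed overhead now evaluates to 114 bytes per part (up from the flat 100), given the 20-character sample boundary and 10-digit offsets (9 * 1024^3 = 9663676416). A rough tally over the header template introduced further down:

  // '\r\n--00000000000000000001\r\n'                                // 26 bytes
  // 'Content-Type: application/pdf\r\n'                             // 31 bytes
  // 'Content-Range: bytes 9663676416-9663676415/9663676416\r\n\r\n' // 57 bytes
  // total: HEADER_OVERHEAD_PER_MULTI_PART_CHUNK === 114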
@@ -66,10 +73,25 @@ function preprocessFileOnce({ file, usageScore, cachedUrls }) {
 /**
  * @param {Array} chunks
  */
-function countBytes(chunks) {
-  return chunks.reduce((totalBytes, chunk) => {
-    return totalBytes + (chunk.end - chunk.start)
-  }, 0)
+function estimateSizeOfMultipartResponse(chunks) {
+  /*
+  --boundary
+  HEADER
+  BLOB
+  --boundary
+  HEADER
+  BLOB
+  --boundary--
+  */
+  return (
+    chunks.reduce(
+      (totalBytes, chunk) =>
+        totalBytes +
+        HEADER_OVERHEAD_PER_MULTI_PART_CHUNK +
+        (chunk.end - chunk.start),
+      0
+    ) + ('\r\n' + SAMPLE_NGINX_BOUNDARY + '--').length
+  )
 }

 /**
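As a usage sketch with invented chunk values (note the reduce is seeded with 0 so the sum stays numeric): two 128 KiB chunks batched into one multipart request are estimated at 2 * (114 + 131072) + 24 = 262396 bytes, the trailing 24 bytes accounting for the closing boundary marker:

  estimateSizeOfMultipartResponse([
    { start: 0, end: 128 * 1024 },
    { start: 256 * 1024, end: 384 * 1024 },
  ]) // => 262396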
@@ -350,6 +372,19 @@ function getMultipartBoundary(response, chunk) {
   })
 }

+/**
+ * @param {string} boundary
+ * @param {number} start
+ * @param {number} end
+ * @param {number} size
+ * @return {string}
+ */
+function composeMultipartHeader({ boundary, start, end, size }) {
+  return `\r\n--${boundary}\r\nContent-Type: application/pdf\r\nContent-Range: bytes ${start}-${
+    end - 1
+  }/${size}\r\n\r\n`
+}
+
 /**
  * @param {Object} file
  * @param {Array} chunks
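A concrete illustration of the helper's output, with invented values; the shape matches the per-part header nginx emits in a multipart/byteranges response:

  composeMultipartHeader({
    boundary: '00000000000000000001',
    start: 0,
    end: 128 * 1024,
    size: 256 * 1024,
  })
  // => '\r\n--00000000000000000001\r\n' +
  //    'Content-Type: application/pdf\r\n' +
  //    'Content-Range: bytes 0-131071/262144\r\n\r\n'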
@@ -362,9 +397,12 @@ function resolveMultiPartResponses({ file, chunks, data, boundary, metrics }) {
   let offsetStart = 0
   const encoder = new TextEncoder()
   for (const chunk of chunks) {
-    const header = `\r\n--${boundary}\r\nContent-Type: application/pdf\r\nContent-Range: bytes ${
-      chunk.start
-    }-${chunk.end - 1}/${file.size}\r\n\r\n`
+    const header = composeMultipartHeader({
+      boundary,
+      start: chunk.start,
+      end: chunk.end,
+      size: file.size,
+    })
     const headerSize = header.length

     // Verify header content. A proxy might have tampered with it.
@@ -398,11 +436,28 @@ function resolveMultiPartResponses({ file, chunks, data, boundary, metrics }) {
 /**
  *
  * @param {Response} response
+ * @param {number} estimatedSize
+ * @param {RequestInit} init
  */
-function checkChunkResponse(response) {
+function checkChunkResponse(response, estimatedSize, init) {
   if (!(response.status === 206 || response.status === 200)) {
     throw new OError('non successful response status: ' + response.status)
   }
+  const responseSize = getResponseSize(response)
+  if (!responseSize) {
+    throw new OError('content-length response header missing', {
+      responseHeaders: Object.fromEntries(response.headers.entries()),
+      requestHeader: init.headers,
+    })
+  }
+  if (responseSize > estimatedSize) {
+    throw new OError('response size exceeds estimate', {
+      estimatedSize,
+      responseSize,
+      responseHeaders: Object.fromEntries(response.headers.entries()),
+      requestHeader: init.headers,
+    })
+  }
 }

 /**
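A minimal sketch of the two new bail-out paths (hypothetical numbers; getResponseSize is defined elsewhere in this file and, judging by the error message, reads the Content-Length header):

  // Proxy stripped Content-Length:
  checkChunkResponse(response, 131072, init)
  // -> throws OError('content-length response header missing')

  // Response larger than the requested ranges can justify:
  // responseSize === 1048576, estimatedSize === 524288
  // -> throws OError('response size exceeds estimate')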
@@ -414,11 +469,12 @@
  */
 export async function fallbackRequest({ url, start, end, abortSignal }) {
   try {
-    const response = await fetch(url, {
+    const init = {
       headers: { Range: `bytes=${start}-${end - 1}` },
       signal: abortSignal,
-    })
-    checkChunkResponse(response)
+    }
+    const response = await fetch(url, init)
+    checkChunkResponse(response, end - start, init)
     return await response.arrayBuffer()
   } catch (e) {
     throw OError.tag(e, 'fallback request failed', { url, start, end })
@@ -471,9 +527,9 @@ function skipPrefetched(chunks, prefetched, start, end) {
 }

 /**
- * @param {Object} chunk
+ * @param {Object|Object[]} chunk
  * @param {string} url
- * @param {Object} init
+ * @param {RequestInit} init
  * @param {Map<string, string>} cachedUrls
  * @param {Object} metrics
  * @param {boolean} cachedUrlLookupEnabled
@@ -507,7 +563,10 @@ async function fetchChunk({
     }
   }
   const response = await fetch(url, init)
-  checkChunkResponse(response)
+  const estimatedSize = Array.isArray(chunk)
+    ? estimateSizeOfMultipartResponse(chunk)
+    : chunk.end - chunk.start
+  checkChunkResponse(response, estimatedSize, init)
   if (chunk.hash) cachedUrls.set(chunk.hash, url)
   return response
 }
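So a plain chunk is bounded by its own byte span, while a batched multipart request is bounded by the estimator (illustrative shapes, consistent with the example above):

  // chunk = { start: 0, end: 131072 }          -> estimatedSize 131072
  // chunk = [{ start: 0, end: 131072 },
  //          { start: 262144, end: 393216 }]   -> estimatedSize 262396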
@@ -565,9 +624,7 @@ function addPrefetchingChunks({
     )
   )

-  let sum =
-    countBytes(dynamicChunks) +
-    HEADER_OVERHEAD_PER_MULTI_PART_CHUNK * dynamicChunks.length
+  let sum = estimateSizeOfMultipartResponse(dynamicChunks)
   for (const chunk of extraChunks) {
     const downloadSize =
       chunk.end - chunk.start + HEADER_OVERHEAD_PER_MULTI_PART_CHUNK