Merge pull request #4184 from overleaf/jpa-incremental-caching

[perf] serviceWorker: fill the browser cache incrementally

GitOrigin-RevId: 3e72838d7521f1adc63661e8e4235aaa104fef79
Jakob Ackermann 2021-06-14 10:11:37 +02:00 committed by Copybot
parent a06031425b
commit c12fc1293d

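In outline, the change keeps a per-client Set of chunk hashes that have already been observed to come out of the browser's HTTP cache, and exempts those chunks from the subrequest-amplification limits on later range requests; the Set is bounded and behaves like a cheap LRU. A minimal standalone sketch of the bounding step (the constant mirrors the diff; the helper name and the numbers in the usage are illustrative, not part of the commit):

const INCREMENTAL_CACHE_SIZE = 1000

// Drop the oldest remembered hashes once the Set outgrows the limit.
// Set iteration follows insertion order, so the first keys seen are the oldest.
function trimCachedHashes(cached) {
  for (const oldest of cached) {
    if (cached.size <= INCREMENTAL_CACHE_SIZE) break
    cached.delete(oldest)
  }
}

const cached = new Set()
for (let i = 0; i < 1200; i++) cached.add(`hash-${i}`)
trimCachedHashes(cached)
console.log(cached.size) // 1000
console.log(cached.has('hash-0')) // false -> the oldest entries were evicted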

@@ -10,6 +10,7 @@ const PDF_REQUEST_MATCHER = /^\/project\/[0-9a-f]{24}\/.*\/output.pdf$/
 const PDF_JS_CHUNK_SIZE = 128 * 1024
 const MAX_SUBREQUEST_COUNT = 8
 const MAX_SUBREQUEST_BYTES = 4 * PDF_JS_CHUNK_SIZE
+const INCREMENTAL_CACHE_SIZE = 1000
 // Each compile request defines a context (essentially the specific pdf file for
 // that compile), requests for that pdf file can use the hashes in the compile
@@ -23,6 +24,7 @@ const CLIENT_CONTEXT = new Map()
 function getClientContext(clientId) {
   let clientContext = CLIENT_CONTEXT.get(clientId)
   if (!clientContext) {
+    const cached = new Set()
     const pdfs = new Map()
     const metrics = {
       version: VERSION,
@@ -39,7 +41,7 @@ function getClientContext(clientId) {
       requestedBytes: 0,
       compileCount: 0,
     }
-    clientContext = { pdfs, metrics }
+    clientContext = { pdfs, metrics, cached }
     CLIENT_CONTEXT.set(clientId, clientContext)
     // clean up old client maps
     expirePdfContexts()
@@ -54,8 +56,9 @@ function getClientContext(clientId) {
  */
 function registerPdfContext(clientId, path, pdfContext) {
   const clientContext = getClientContext(clientId)
-  const { pdfs, metrics } = clientContext
+  const { pdfs, metrics, cached } = clientContext
   pdfContext.metrics = metrics
+  pdfContext.cached = cached
   // we only need to keep the last 3 contexts
   for (const key of pdfs.keys()) {
     if (pdfs.size < 3) {
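The Set is created once per client and handed to every pdfContext registered for that client, so hashes learned while serving one compile's pdf carry over to range requests for the next compile. A trivial sketch of that sharing (object names are illustrative):

const cached = new Set() // one instance per client, as in getClientContext
const firstCompile = { cached } // pdfContext for compile N
const secondCompile = { cached } // pdfContext for compile N+1 shares the same Set
firstCompile.cached.add('abc123')
console.log(secondCompile.cached.has('abc123')) // true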
@@ -228,10 +231,11 @@ function handleProbeRequest(request, file) {
  * @param {string} compileGroup
  * @param {Date} pdfCreatedAt
  * @param {Object} metrics
+ * @param {Set} cached
  */
 function processPdfRequest(
   event,
-  { file, clsiServerId, compileGroup, pdfCreatedAt, metrics }
+  { file, clsiServerId, compileGroup, pdfCreatedAt, metrics, cached }
 ) {
   const response = handleProbeRequest(event.request, file)
   if (response) {
@@ -249,9 +253,13 @@
   // Check that handling the range request won't trigger excessive subrequests,
   // (to avoid unwanted latency compared to the original request).
-  const chunks = getMatchingChunks(file.ranges, start, end)
+  const { chunks, newChunks } = cutRequestAmplification(
+    getMatchingChunks(file.ranges, start, end),
+    cached,
+    metrics
+  )
   const dynamicChunks = getInterleavingDynamicChunks(chunks, start, end)
-  const chunksSize = countBytes(chunks)
+  const chunksSize = countBytes(newChunks)
   const size = end - start
   if (chunks.length === 0 && dynamicChunks.length === 1) {
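With this change, chunks is everything that can be assembled from hash-addressed blobs (including chunks the browser cache should answer without a network round trip), while newChunks is the subset whose hashes have not been seen in the cache yet; only the latter counts toward the byte budget checked below. A worked example under the assumption that countBytes sums end minus start:

const PDF_JS_CHUNK_SIZE = 128 * 1024
const MAX_SUBREQUEST_BYTES = 4 * PDF_JS_CHUNK_SIZE // 524288 bytes
const countBytes = chunks =>
  chunks.reduce((sum, chunk) => sum + (chunk.end - chunk.start), 0)

// Say three chunks match the requested range and two of them were seen in the cache before:
const newChunks = [{ hash: 'bbb', start: 262144, end: 262144 + PDF_JS_CHUNK_SIZE }]
console.log(countBytes(newChunks)) // 131072
console.log(countBytes(newChunks) > MAX_SUBREQUEST_BYTES) // false -> no fallback to the full range request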
@@ -265,29 +273,13 @@
     })
     return
   }
-  if (
-    chunks.length + (dynamicChunks.length > 0 ? 1 : 0) >
-    MAX_SUBREQUEST_COUNT
-  ) {
-    // fall back to the original range request when splitting the range creates
-    // too many subrequests.
-    metrics.tooManyRequestsCount++
-    trackDownloadStats(metrics, {
-      size,
-      cachedCount: 0,
-      cachedBytes: 0,
-      fetchedCount: 1,
-      fetchedBytes: size,
-    })
-    return
-  }
   if (
     chunksSize > MAX_SUBREQUEST_BYTES &&
-    !(dynamicChunks.length === 0 && chunks.length === 1)
+    !(dynamicChunks.length === 0 && newChunks.length <= 1)
   ) {
     // fall back to the original range request when a very large amount of
     // object data would be requested, unless it is the only object in the
-    // request.
+    // request or everything is already cached.
     metrics.tooLargeOverheadCount++
     trackDownloadStats(metrics, {
       size,
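The dedicated too-many-subrequests fallback is gone because cutRequestAmplification (added further down) now caps how many uncached chunks get their own subrequest and increments metrics.tooManyRequestsCount itself; the size guard that remains also tolerates ranges that are already fully cached. A small sketch of the new guard evaluating on a fully cached range (variable shapes assumed):

const MAX_SUBREQUEST_BYTES = 4 * 128 * 1024
const dynamicChunks = [] // no un-hashed gaps in the requested range
const newChunks = [] // every matching chunk hash was already in `cached`
const chunksSize = 0 // countBytes(newChunks)
const fallBackToSingleRangeRequest =
  chunksSize > MAX_SUBREQUEST_BYTES &&
  !(dynamicChunks.length === 0 && newChunks.length <= 1)
console.log(fallBackToSingleRangeRequest) // false -> assemble the response from cached blobs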
@@ -370,6 +362,9 @@ function processPdfRequest(
         if (blobFetchDate < pdfCreatedAt) {
           cachedCount++
           cachedBytes += chunkSize
+          // Roll the position of the hash in the Map.
+          cached.delete(chunk.hash)
+          cached.add(chunk.hash)
         } else {
           // Blobs are fetched in bulk.
           fetchedCount++
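Deleting the hash before re-adding it rolls it to the back of the Set's insertion order, which is what lets the eviction loop in cutRequestAmplification (added below) drop the least recently confirmed hashes first. Illustration of that Set behaviour:

const cached = new Set(['old', 'touched', 'recent'])
// A cache hit on 'touched' rolls it to the back of the insertion order:
cached.delete('touched')
cached.add('touched')
console.log([...cached]) // ['old', 'recent', 'touched'] -> 'old' is still first in line for eviction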
@@ -636,6 +631,43 @@ function getMatchingChunks(chunks, start, end) {
   return matchingChunks
 }
+/**
+ * @param {Array} potentialChunks
+ * @param {Set} cached
+ * @param {Object} metrics
+ */
+function cutRequestAmplification(potentialChunks, cached, metrics) {
+  const chunks = []
+  const newChunks = []
+  let tooManyRequests = false
+  for (const chunk of potentialChunks) {
+    if (cached.has(chunk.hash)) {
+      chunks.push(chunk)
+      continue
+    }
+    if (newChunks.length < MAX_SUBREQUEST_COUNT) {
+      chunks.push(chunk)
+      newChunks.push(chunk)
+    } else {
+      tooManyRequests = true
+    }
+  }
+  if (tooManyRequests) {
+    metrics.tooManyRequestsCount++
+  }
+  if (cached.size > INCREMENTAL_CACHE_SIZE) {
+    for (const key of cached) {
+      if (cached.size < INCREMENTAL_CACHE_SIZE) {
+        break
+      }
+      // Map keys are stored in insertion order.
+      // We re-insert keys on cache hit, 'cached' is a cheap LRU.
+      cached.delete(key)
+    }
+  }
+  return { chunks, newChunks }
+}
 /**
  * @param {Array} chunks
  * @param {number} start