Merge pull request #234 from overleaf/jpa-stream-detection-across-chunks

[ContentCacheManager] add support for stream detection across chunks
Authored by Jakob Ackermann on 2021-05-18 11:44:09 +02:00; committed by GitHub
commit 9b8763aed4
2 changed files with 186 additions and 5 deletions
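
The problem in a nutshell: output.pdf is consumed as a sequence of Buffer chunks, and the 'stream'/'endstream' markers that delimit PDF object streams can straddle a chunk boundary, where a per-chunk indexOf never sees them. A minimal demonstration of the failure mode, reusing fixture bytes from the new test below (this snippet is illustrative, not code from the commit):

// Scanning each chunk in isolation misses markers that span chunks.
const chunks = [Buffer.from('abcstr'), Buffer.from('eam123endstreamABC')]
for (const chunk of chunks) {
  console.log(chunk.indexOf('stream'))
}
// Prints -1, then 9: the real start marker at absolute offset 3 is never
// found, and the offset-9 hit is the 'stream' inside 'endstream'.

The change below carries a short tail of every chunk over to the next scan, so each marker is eventually seen inside one contiguous Buffer.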

@@ -10,6 +10,11 @@ const Settings = require('settings-sharelatex')
const MIN_CHUNK_SIZE = Settings.pdfCachingMinChunkSize
+const START_OF_STREAM_MARKER = 'stream'
+const END_OF_STREAM_MARKER = 'endstream'
+const START_OF_STREAM_MARKER_LENGTH = START_OF_STREAM_MARKER.length
+const END_OF_STREAM_MARKER_LENGTH = END_OF_STREAM_MARKER.length
/**
*
* @param {String} contentDir path to directory where content hash files are cached
@@ -41,15 +46,17 @@ class PdfStreamsExtractor {
this.inStream = false
this.streamStartIndex = 0
this.buffers = []
+this.lastChunk = Buffer.alloc(0)
}
consume(chunk) {
let chunkIndex = 0
const pdfStreams = []
+chunk = Buffer.concat([this.lastChunk, chunk])
while (true) {
if (!this.inStream) {
// Not in a stream, look for stream start
-const index = chunk.indexOf('stream', chunkIndex)
+const index = chunk.indexOf(START_OF_STREAM_MARKER, chunkIndex)
if (index === -1) {
// Couldn't find stream start
break
@@ -60,13 +67,12 @@
chunkIndex = index
} else {
// In a stream, look for stream end
-const index = chunk.indexOf('endstream', chunkIndex)
+const index = chunk.indexOf(END_OF_STREAM_MARKER, chunkIndex)
if (index === -1) {
-this.buffers.push(chunk.slice(chunkIndex))
break
}
// add "endstream" part
-const endIndex = index + 9
+const endIndex = index + END_OF_STREAM_MARKER_LENGTH
this.buffers.push(chunk.slice(chunkIndex, endIndex))
pdfStreams.push({
start: this.streamStartIndex,
@@ -78,7 +84,22 @@
chunkIndex = endIndex
}
}
-this.fileIndex += chunk.length
+const remaining = chunk.length - chunkIndex
+const nextMarkerLength = this.inStream
+  ? END_OF_STREAM_MARKER_LENGTH
+  : START_OF_STREAM_MARKER_LENGTH
+if (remaining > nextMarkerLength) {
+  const retainMarkerSection = chunk.length - nextMarkerLength
+  if (this.inStream) {
+    this.buffers.push(chunk.slice(chunkIndex, retainMarkerSection))
+  }
+  this.lastChunk = chunk.slice(retainMarkerSection)
+  this.fileIndex += retainMarkerSection
+} else {
+  this.lastChunk = chunk.slice(chunkIndex)
+  this.fileIndex += chunkIndex
+}
return pdfStreams
}
}
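
Two details make the carry-over correct. First, when 'endstream' is not found, the loop no longer pushes the unterminated tail to this.buffers on the spot (the removed push above); the post-loop code decides how much of the tail is safe to buffer (everything except the last nextMarkerLength bytes) and stashes the rest in lastChunk for rescanning. Second, fileIndex advances only past bytes that were actually consumed, so the reported start/end offsets stay absolute file positions even though every scan operates on lastChunk + chunk. A condensed, self-contained sketch of the idea, not the committed module (the class name and the offsets-only return shape are mine; content buffering is omitted):

const START = 'stream'
const END = 'endstream'

class StreamFinder {
  constructor() {
    this.inStream = false
    this.streamStartIndex = 0
    this.fileIndex = 0
    this.lastChunk = Buffer.alloc(0)
  }

  consume(chunk) {
    const found = []
    // Prepend the unscanned tail of the previous chunk.
    chunk = Buffer.concat([this.lastChunk, chunk])
    let chunkIndex = 0
    while (true) {
      const marker = this.inStream ? END : START
      const index = chunk.indexOf(marker, chunkIndex)
      if (index === -1) break
      if (this.inStream) {
        found.push({
          start: this.streamStartIndex,
          end: this.fileIndex + index + END.length
        })
        this.inStream = false
        chunkIndex = index + END.length
      } else {
        this.inStream = true
        this.streamStartIndex = this.fileIndex + index
        chunkIndex = index // as in the original: the marker stays in the unconsumed region
      }
    }
    // Keep just enough tail for a split marker to complete next time.
    const nextMarkerLength = (this.inStream ? END : START).length
    const remaining = chunk.length - chunkIndex
    const cut =
      remaining > nextMarkerLength ? chunk.length - nextMarkerLength : chunkIndex
    this.lastChunk = chunk.slice(cut)
    this.fileIndex += cut
    return found
  }
}

// Markers split across chunks still resolve to absolute file offsets:
const finder = new StreamFinder()
const ranges = []
for (const piece of ['abcstr', 'eam123endstr', 'eam']) {
  ranges.push(...finder.consume(Buffer.from(piece)))
}
console.log(ranges) // [ { start: 3, end: 21 } ]

Retaining nextMarkerLength bytes is sufficient because a marker split across a boundary can leave at most nextMarkerLength - 1 of its leading bytes in the current chunk; if all nextMarkerLength trailing bytes formed the whole marker, indexOf would already have found it.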

@@ -0,0 +1,160 @@
const Path = require('path')
const crypto = require('crypto')
const { Readable } = require('stream')
const SandboxedModule = require('sandboxed-module')
const sinon = require('sinon')
const { expect } = require('chai')
const MODULE_PATH = '../../../app/js/ContentCacheManager'
class FakeFile {
constructor() {
this.closed = false
this.contents = []
}
async write(blob) {
this.contents.push(blob)
return this
}
async close() {
this.closed = true
return this
}
toJSON() {
return {
contents: Buffer.concat(this.contents).toString(),
closed: this.closed
}
}
}
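// FakeFile records every write in memory; toJSON() lets the deep-equal
// assertions below compare a file as a plain { contents, closed } object.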
function hash(blob) {
const hash = crypto.createHash('sha256')
hash.update(blob)
return hash.digest('hex')
}
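// Mirrors the cache naming scheme asserted below: each extracted range is
// written to a file named after the sha-256 hex digest of its contents.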
describe('ContentCacheManager', function () {
let contentDir, pdfPath
let ContentCacheManager, fs, files, Settings
function load() {
ContentCacheManager = SandboxedModule.require(MODULE_PATH, {
requires: {
fs,
'settings-sharelatex': Settings
}
})
}
let contentRanges, newContentRanges
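// run() drives ContentCacheManager.update, which calls back with
// (err, [allRanges, newRanges]); both lists are captured for the
// assertions below.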
function run(filePath, done) {
ContentCacheManager.update(contentDir, filePath, (err, ranges) => {
if (err) return done(err)
;[contentRanges, newContentRanges] = ranges
done()
})
}
beforeEach(function () {
contentDir =
'/app/output/602cee6f6460fca0ba7921e6/content/1797a7f48f9-5abc1998509dea1f'
pdfPath =
'/app/output/602cee6f6460fca0ba7921e6/generated-files/1797a7f48ea-8ac6805139f43351/output.pdf'
Settings = {
pdfCachingMinChunkSize: 1024,
enablePdfCachingDark: false
}
files = {}
fs = {
createReadStream: sinon.stub().returns(Readable.from([])),
promises: {
async open(name) {
files[name] = new FakeFile()
return files[name]
},
async stat(name) {
if (!files[name]) {
throw new Error()
}
},
rename: sinon.stub().resolves(),
unlink: sinon.stub().resolves()
}
}
})
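// The fs stub is fully in-memory: open() hands out FakeFile recorders
// keyed by path, and stat() rejects for any path that was never opened,
// which presumably is what makes ContentCacheManager treat every
// extracted range as new (see the final test).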
describe('with a small minChunkSize', function () {
beforeEach(function () {
Settings.pdfCachingMinChunkSize = 1
load()
})
describe('when the ranges are split across chunks', function () {
const RANGE_1 = 'stream123endstream'
const RANGE_2 = 'stream(|)endstream'
const RANGE_3 = 'stream!$%endstream'
beforeEach(function (done) {
fs.createReadStream
.withArgs(pdfPath)
.returns(
Readable.from([
Buffer.from('abcstr'),
Buffer.from('eam123endstreamABC'),
Buffer.from('str'),
Buffer.from('eam(|'),
Buffer.from(')end'),
Buffer.from('stream-_~stream!$%endstream')
])
)
run(pdfPath, done)
})
it('should produce three ranges', function () {
expect(contentRanges).to.have.length(3)
})
it('should find the correct offsets', function () {
expect(contentRanges).to.deep.equal([
{
start: 3,
end: 21,
hash: hash(RANGE_1)
},
{
start: 24,
end: 42,
hash: hash(RANGE_2)
},
{
start: 45,
end: 63,
hash: hash(RANGE_3)
}
])
})
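// Byte layout of the concatenated fixture (each range is 6 + 3 + 9 = 18
// bytes; the asserted 'end' offsets are exclusive):
//   abc   stream123endstream   ABC     stream(|)endstream   -_~     stream!$%endstream
//   0-2   3-20 (end: 21)       21-23   24-41 (end: 42)      42-44   45-62 (end: 63)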
it('should store the contents', function () {
expect(JSON.parse(JSON.stringify(files))).to.deep.equal({
[Path.join(contentDir, hash(RANGE_1))]: {
contents: RANGE_1,
closed: true
},
[Path.join(contentDir, hash(RANGE_2))]: {
contents: RANGE_2,
closed: true
},
[Path.join(contentDir, hash(RANGE_3))]: {
contents: RANGE_3,
closed: true
}
})
})
it('should mark all ranges as new', function () {
expect(contentRanges).to.deep.equal(newContentRanges)
})
})
})
})
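
The suite is mocha-style (describe/it) with chai assertions and sinon stubs. Assuming a standard mocha setup, it should run with something like: npx mocha path/to/this/test.js. The exact test path is not shown here, though MODULE_PATH above implies the file lives three directory levels below the service root (e.g. a test/unit/js/ layout).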