diff --git a/services/clsi/app/js/ContentCacheManager.js b/services/clsi/app/js/ContentCacheManager.js
index 099f0ee801..da40413651 100644
--- a/services/clsi/app/js/ContentCacheManager.js
+++ b/services/clsi/app/js/ContentCacheManager.js
@@ -10,6 +10,11 @@ const Settings = require('settings-sharelatex')
 
 const MIN_CHUNK_SIZE = Settings.pdfCachingMinChunkSize
 
+const START_OF_STREAM_MARKER = 'stream'
+const END_OF_STREAM_MARKER = 'endstream'
+const START_OF_STREAM_MARKER_LENGTH = START_OF_STREAM_MARKER.length
+const END_OF_STREAM_MARKER_LENGTH = END_OF_STREAM_MARKER.length
+
 /**
  *
  * @param {String} contentDir path to directory where content hash files are cached
@@ -41,15 +46,17 @@ class PdfStreamsExtractor {
     this.inStream = false
     this.streamStartIndex = 0
     this.buffers = []
+    this.lastChunk = Buffer.alloc(0)
   }
 
   consume(chunk) {
     let chunkIndex = 0
     const pdfStreams = []
+    chunk = Buffer.concat([this.lastChunk, chunk])
     while (true) {
       if (!this.inStream) {
         // Not in a stream, look for stream start
-        const index = chunk.indexOf('stream', chunkIndex)
+        const index = chunk.indexOf(START_OF_STREAM_MARKER, chunkIndex)
         if (index === -1) {
           // Couldn't find stream start
           break
@@ -60,13 +67,12 @@ class PdfStreamsExtractor {
         chunkIndex = index
       } else {
         // In a stream, look for stream end
-        const index = chunk.indexOf('endstream', chunkIndex)
+        const index = chunk.indexOf(END_OF_STREAM_MARKER, chunkIndex)
         if (index === -1) {
-          this.buffers.push(chunk.slice(chunkIndex))
           break
         }
         // add "endstream" part
-        const endIndex = index + 9
+        const endIndex = index + END_OF_STREAM_MARKER_LENGTH
         this.buffers.push(chunk.slice(chunkIndex, endIndex))
         pdfStreams.push({
           start: this.streamStartIndex,
@@ -78,7 +84,22 @@ class PdfStreamsExtractor {
         chunkIndex = endIndex
       }
     }
-    this.fileIndex += chunk.length
+
+    const remaining = chunk.length - chunkIndex
+    const nextMarkerLength = this.inStream
+      ? END_OF_STREAM_MARKER_LENGTH
+      : START_OF_STREAM_MARKER_LENGTH
+    if (remaining > nextMarkerLength) {
+      const retainMarkerSection = chunk.length - nextMarkerLength
+      if (this.inStream) {
+        this.buffers.push(chunk.slice(chunkIndex, retainMarkerSection))
+      }
+      this.lastChunk = chunk.slice(retainMarkerSection)
+      this.fileIndex += retainMarkerSection
+    } else {
+      this.lastChunk = chunk.slice(chunkIndex)
+      this.fileIndex += chunkIndex
+    }
     return pdfStreams
   }
 }
diff --git a/services/clsi/test/unit/js/ContentCacheManagerTests.js b/services/clsi/test/unit/js/ContentCacheManagerTests.js
new file mode 100644
index 0000000000..90fe60c52c
--- /dev/null
+++ b/services/clsi/test/unit/js/ContentCacheManagerTests.js
@@ -0,0 +1,160 @@
+const Path = require('path')
+const crypto = require('crypto')
+const { Readable } = require('stream')
+const SandboxedModule = require('sandboxed-module')
+const sinon = require('sinon')
+const { expect } = require('chai')
+
+const MODULE_PATH = '../../../app/js/ContentCacheManager'
+
+class FakeFile {
+  constructor() {
+    this.closed = false
+    this.contents = []
+  }
+
+  async write(blob) {
+    this.contents.push(blob)
+    return this
+  }
+
+  async close() {
+    this.closed = true
+    return this
+  }
+
+  toJSON() {
+    return {
+      contents: Buffer.concat(this.contents).toString(),
+      closed: this.closed
+    }
+  }
+}
+
+function hash(blob) {
+  const hash = crypto.createHash('sha256')
+  hash.update(blob)
+  return hash.digest('hex')
+}
+
+describe('ContentCacheManager', function () {
+  let contentDir, pdfPath
+  let ContentCacheManager, fs, files, Settings
+  function load() {
+    ContentCacheManager = SandboxedModule.require(MODULE_PATH, {
+      requires: {
+        fs,
+        'settings-sharelatex': Settings
+      }
+    })
+  }
+  let contentRanges, newContentRanges
+  function run(filePath, done) {
+    ContentCacheManager.update(contentDir, filePath, (err, ranges) => {
+      if (err) return done(err)
+      ;[contentRanges, newContentRanges] = ranges
+      done()
+    })
+  }
+
+  beforeEach(function () {
+    contentDir =
+      '/app/output/602cee6f6460fca0ba7921e6/content/1797a7f48f9-5abc1998509dea1f'
+    pdfPath =
+      '/app/output/602cee6f6460fca0ba7921e6/generated-files/1797a7f48ea-8ac6805139f43351/output.pdf'
+    Settings = {
+      pdfCachingMinChunkSize: 1024,
+      enablePdfCachingDark: false
+    }
+    files = {}
+    fs = {
+      createReadStream: sinon.stub().returns(Readable.from([])),
+      promises: {
+        async open(name) {
+          files[name] = new FakeFile()
+          return files[name]
+        },
+        async stat(name) {
+          if (!files[name]) {
+            throw new Error()
+          }
+        },
+        rename: sinon.stub().resolves(),
+        unlink: sinon.stub().resolves()
+      }
+    }
+  })
+
+  describe('with a small minChunkSize', function () {
+    beforeEach(function () {
+      Settings.pdfCachingMinChunkSize = 1
+      load()
+    })
+
+    describe('when the ranges are split across chunks', function () {
+      const RANGE_1 = 'stream123endstream'
+      const RANGE_2 = 'stream(|)endstream'
+      const RANGE_3 = 'stream!$%endstream'
+      beforeEach(function (done) {
+        fs.createReadStream
+          .withArgs(pdfPath)
+          .returns(
+            Readable.from([
+              Buffer.from('abcstr'),
+              Buffer.from('eam123endstreamABC'),
+              Buffer.from('str'),
+              Buffer.from('eam(|'),
+              Buffer.from(')end'),
+              Buffer.from('stream-_~stream!$%endstream')
+            ])
+          )
+        run(pdfPath, done)
+      })
+
+      it('should produce three ranges', function () {
+        expect(contentRanges).to.have.length(3)
+      })
+
+      it('should find the correct offsets', function () {
+        expect(contentRanges).to.deep.equal([
+          {
+            start: 3,
+            end: 21,
+            hash: hash(RANGE_1)
+          },
+          {
+            start: 24,
+            end: 42,
+            hash: hash(RANGE_2)
+          },
+          {
+            start: 45,
+            end: 63,
+            hash: hash(RANGE_3)
+          }
+        ])
+      })
+
+      it('should store the contents', function () {
+        expect(JSON.parse(JSON.stringify(files))).to.deep.equal({
+          [Path.join(contentDir, hash(RANGE_1))]: {
+            contents: RANGE_1,
+            closed: true
+          },
+          [Path.join(contentDir, hash(RANGE_2))]: {
+            contents: RANGE_2,
+            closed: true
+          },
+          [Path.join(contentDir, hash(RANGE_3))]: {
+            contents: RANGE_3,
+            closed: true
+          }
+        })
+      })
+
+      it('should mark all ranges as new', function () {
+        expect(contentRanges).to.deep.equal(newContentRanges)
+      })
+    })
+  })
+})
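
For context on the technique: the patch handles "stream"/"endstream" markers that straddle chunk boundaries by carrying the unconsumed tail of each chunk into the next consume() call, retaining at most one marker's length of bytes between calls. The standalone sketch below illustrates that carry-over in isolation; the MarkerScanner name and the offset-only bookkeeping (no content buffering) are simplifications for this sketch, not part of the patch.

const START_MARKER = 'stream'
const END_MARKER = 'endstream'

class MarkerScanner {
  constructor() {
    this.inStream = false
    this.streamStartIndex = 0
    this.fileIndex = 0 // absolute file offset of lastChunk[0]
    this.lastChunk = Buffer.alloc(0) // unconsumed tail of the previous chunk
  }

  consume(chunk) {
    // Prepend the retained tail so a marker split across chunks is found.
    chunk = Buffer.concat([this.lastChunk, chunk])
    const ranges = []
    let chunkIndex = 0
    while (true) {
      const marker = this.inStream ? END_MARKER : START_MARKER
      const index = chunk.indexOf(marker, chunkIndex)
      if (index === -1) break
      if (this.inStream) {
        // Close the current range just past "endstream".
        const endIndex = index + END_MARKER.length
        ranges.push({
          start: this.streamStartIndex,
          end: this.fileIndex + endIndex
        })
        this.inStream = false
        chunkIndex = endIndex
      } else {
        // Open a range at "stream"; the marker itself is part of the range.
        this.streamStartIndex = this.fileIndex + index
        this.inStream = true
        chunkIndex = index
      }
    }
    // Retain a tail no longer than the next marker, mirroring the patch's
    // remaining/nextMarkerLength bookkeeping.
    const keepFrom = Math.max(
      chunkIndex,
      chunk.length - (this.inStream ? END_MARKER.length : START_MARKER.length)
    )
    this.lastChunk = chunk.slice(keepFrom)
    this.fileIndex += keepFrom
    return ranges
  }
}

Feeding the unit test's chunk sequence through this sketch reproduces the offsets the test asserts:

const scanner = new MarkerScanner()
const ranges = []
for (const part of [
  'abcstr',
  'eam123endstreamABC',
  'str',
  'eam(|',
  ')end',
  'stream-_~stream!$%endstream'
]) {
  ranges.push(...scanner.consume(Buffer.from(part)))
}
// ranges: [ { start: 3, end: 21 }, { start: 24, end: 42 }, { start: 45, end: 63 } ]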