Merge pull request #234 from overleaf/jpa-stream-detection-across-chunks

[ContentCacheManager] add support for stream detection across chunks
Authored by Jakob Ackermann on 2021-05-18 11:44:09 +02:00; committed by GitHub
commit 9b8763aed4
2 changed files with 186 additions and 5 deletions
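
The problem in a nutshell: output.pdf is consumed as a sequence of Buffer chunks, and the 'stream'/'endstream' markers that delimit PDF object streams can straddle a chunk boundary, where a per-chunk indexOf never sees them. A minimal demonstration of the failure mode, reusing fixture bytes from the new test below (this snippet is illustrative, not code from the commit):

// Scanning each chunk in isolation misses markers that span chunks.
const chunks = [Buffer.from('abcstr'), Buffer.from('eam123endstreamABC')]
for (const chunk of chunks) {
  console.log(chunk.indexOf('stream'))
}
// Prints -1, then 9: the real start marker at absolute offset 3 is never
// found, and the offset-9 hit is the 'stream' inside 'endstream'.

The change below carries a short tail of every chunk over to the next scan, so each marker is eventually seen inside one contiguous Buffer.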

@@ -10,6 +10,11 @@ const Settings = require('settings-sharelatex')
const MIN_CHUNK_SIZE = Settings.pdfCachingMinChunkSize
+const START_OF_STREAM_MARKER = 'stream'
+const END_OF_STREAM_MARKER = 'endstream'
+const START_OF_STREAM_MARKER_LENGTH = START_OF_STREAM_MARKER.length
+const END_OF_STREAM_MARKER_LENGTH = END_OF_STREAM_MARKER.length
/**
*
* @param {String} contentDir path to directory where content hash files are cached
@@ -41,15 +46,17 @@ class PdfStreamsExtractor {
this.inStream = false
this.streamStartIndex = 0
this.buffers = []
+this.lastChunk = Buffer.alloc(0)
}
consume(chunk) {
let chunkIndex = 0
const pdfStreams = []
+chunk = Buffer.concat([this.lastChunk, chunk])
while (true) {
if (!this.inStream) {
// Not in a stream, look for stream start
-const index = chunk.indexOf('stream', chunkIndex)
+const index = chunk.indexOf(START_OF_STREAM_MARKER, chunkIndex)
if (index === -1) {
// Couldn't find stream start
break
@@ -60,13 +67,12 @@
chunkIndex = index
} else {
// In a stream, look for stream end
-const index = chunk.indexOf('endstream', chunkIndex)
+const index = chunk.indexOf(END_OF_STREAM_MARKER, chunkIndex)
if (index === -1) {
-this.buffers.push(chunk.slice(chunkIndex))
break
}
// add "endstream" part
-const endIndex = index + 9
+const endIndex = index + END_OF_STREAM_MARKER_LENGTH
this.buffers.push(chunk.slice(chunkIndex, endIndex))
pdfStreams.push({
start: this.streamStartIndex,
@@ -78,7 +84,22 @@
chunkIndex = endIndex
}
}
-this.fileIndex += chunk.length
+const remaining = chunk.length - chunkIndex
+const nextMarkerLength = this.inStream
+  ? END_OF_STREAM_MARKER_LENGTH
+  : START_OF_STREAM_MARKER_LENGTH
+if (remaining > nextMarkerLength) {
+  const retainMarkerSection = chunk.length - nextMarkerLength
+  if (this.inStream) {
+    this.buffers.push(chunk.slice(chunkIndex, retainMarkerSection))
+  }
+  this.lastChunk = chunk.slice(retainMarkerSection)
+  this.fileIndex += retainMarkerSection
+} else {
+  this.lastChunk = chunk.slice(chunkIndex)
+  this.fileIndex += chunkIndex
+}
return pdfStreams
}
}
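
Two details make the carry-over correct. First, when 'endstream' is not found, the loop no longer pushes the unterminated tail to this.buffers on the spot (the removed push above); the post-loop code decides how much of the tail is safe to buffer (everything except the last nextMarkerLength bytes) and stashes the rest in lastChunk for rescanning. Second, fileIndex advances only past bytes that were actually consumed, so the reported start/end offsets stay absolute file positions even though every scan operates on lastChunk + chunk. A condensed, self-contained sketch of the idea, not the committed module (the class name and the offsets-only return shape are mine; content buffering is omitted):

const START = 'stream'
const END = 'endstream'

class StreamFinder {
  constructor() {
    this.inStream = false
    this.streamStartIndex = 0
    this.fileIndex = 0
    this.lastChunk = Buffer.alloc(0)
  }

  consume(chunk) {
    const found = []
    // Prepend the unscanned tail of the previous chunk.
    chunk = Buffer.concat([this.lastChunk, chunk])
    let chunkIndex = 0
    while (true) {
      const marker = this.inStream ? END : START
      const index = chunk.indexOf(marker, chunkIndex)
      if (index === -1) break
      if (this.inStream) {
        found.push({
          start: this.streamStartIndex,
          end: this.fileIndex + index + END.length
        })
        this.inStream = false
        chunkIndex = index + END.length
      } else {
        this.inStream = true
        this.streamStartIndex = this.fileIndex + index
        chunkIndex = index // as in the original: the marker stays in the unconsumed region
      }
    }
    // Keep just enough tail for a split marker to complete next time.
    const nextMarkerLength = (this.inStream ? END : START).length
    const remaining = chunk.length - chunkIndex
    const cut =
      remaining > nextMarkerLength ? chunk.length - nextMarkerLength : chunkIndex
    this.lastChunk = chunk.slice(cut)
    this.fileIndex += cut
    return found
  }
}

// Markers split across chunks still resolve to absolute file offsets:
const finder = new StreamFinder()
const ranges = []
for (const piece of ['abcstr', 'eam123endstr', 'eam']) {
  ranges.push(...finder.consume(Buffer.from(piece)))
}
console.log(ranges) // [ { start: 3, end: 21 } ]

Retaining nextMarkerLength bytes is sufficient because a marker split across a boundary can leave at most nextMarkerLength - 1 of its leading bytes in the current chunk; if all nextMarkerLength trailing bytes formed the whole marker, indexOf would already have found it.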

@@ -0,0 +1,160 @@
const Path = require('path')
const crypto = require('crypto')
const { Readable } = require('stream')
const SandboxedModule = require('sandboxed-module')
const sinon = require('sinon')
const { expect } = require('chai')
const MODULE_PATH = '../../../app/js/ContentCacheManager'
class FakeFile {
constructor() {
this.closed = false
this.contents = []
}
async write(blob) {
this.contents.push(blob)
return this
}
async close() {
this.closed = true
return this
}
toJSON() {
return {
contents: Buffer.concat(this.contents).toString(),
closed: this.closed
}
}
}
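// FakeFile records every write in memory; toJSON() lets the deep-equal
// assertions below compare a file as a plain { contents, closed } object.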
function hash(blob) {
const hash = crypto.createHash('sha256')
hash.update(blob)
return hash.digest('hex')
}
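// Mirrors the cache naming scheme asserted below: each extracted range is
// written to a file named after the sha-256 hex digest of its contents.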
describe('ContentCacheManager', function () {
let contentDir, pdfPath
let ContentCacheManager, fs, files, Settings
function load() {
ContentCacheManager = SandboxedModule.require(MODULE_PATH, {
requires: {
fs,
'settings-sharelatex': Settings
}
})
}
let contentRanges, newContentRanges
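// run() drives ContentCacheManager.update, which calls back with
// (err, [allRanges, newRanges]); both lists are captured for the
// assertions below.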
function run(filePath, done) {
ContentCacheManager.update(contentDir, filePath, (err, ranges) => {
if (err) return done(err)
;[contentRanges, newContentRanges] = ranges
done()
})
}
beforeEach(function () {
contentDir =
'/app/output/602cee6f6460fca0ba7921e6/content/1797a7f48f9-5abc1998509dea1f'
pdfPath =
'/app/output/602cee6f6460fca0ba7921e6/generated-files/1797a7f48ea-8ac6805139f43351/output.pdf'
Settings = {
pdfCachingMinChunkSize: 1024,
enablePdfCachingDark: false
}
files = {}
fs = {
createReadStream: sinon.stub().returns(Readable.from([])),
promises: {
async open(name) {
files[name] = new FakeFile()
return files[name]
},
async stat(name) {
if (!files[name]) {
throw new Error()
}
},
rename: sinon.stub().resolves(),
unlink: sinon.stub().resolves()
}
}
})
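// The fs stub is fully in-memory: open() hands out FakeFile recorders
// keyed by path, and stat() rejects for any path that was never opened,
// which presumably is what makes ContentCacheManager treat every
// extracted range as new (see the final test).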
describe('with a small minChunkSize', function () {
beforeEach(function () {
Settings.pdfCachingMinChunkSize = 1
load()
})
describe('when the ranges are split across chunks', function () {
const RANGE_1 = 'stream123endstream'
const RANGE_2 = 'stream(|)endstream'
const RANGE_3 = 'stream!$%endstream'
beforeEach(function (done) {
fs.createReadStream
.withArgs(pdfPath)
.returns(
Readable.from([
Buffer.from('abcstr'),
Buffer.from('eam123endstreamABC'),
Buffer.from('str'),
Buffer.from('eam(|'),
Buffer.from(')end'),
Buffer.from('stream-_~stream!$%endstream')
])
)
run(pdfPath, done)
})
it('should produce three ranges', function () {
expect(contentRanges).to.have.length(3)
})
it('should find the correct offsets', function () {
expect(contentRanges).to.deep.equal([
{
start: 3,
end: 21,
hash: hash(RANGE_1)
},
{
start: 24,
end: 42,
hash: hash(RANGE_2)
},
{
start: 45,
end: 63,
hash: hash(RANGE_3)
}
])
})
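// Byte layout of the concatenated fixture (each range is 6 + 3 + 9 = 18
// bytes; the asserted 'end' offsets are exclusive):
//   abc   stream123endstream   ABC     stream(|)endstream   -_~     stream!$%endstream
//   0-2   3-20 (end: 21)       21-23   24-41 (end: 42)      42-44   45-62 (end: 63)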
it('should store the contents', function () {
expect(JSON.parse(JSON.stringify(files))).to.deep.equal({
[Path.join(contentDir, hash(RANGE_1))]: {
contents: RANGE_1,
closed: true
},
[Path.join(contentDir, hash(RANGE_2))]: {
contents: RANGE_2,
closed: true
},
[Path.join(contentDir, hash(RANGE_3))]: {
contents: RANGE_3,
closed: true
}
})
})
it('should mark all ranges as new', function () {
expect(contentRanges).to.deep.equal(newContentRanges)
})
})
})
})
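
The suite is mocha-style (describe/it) with chai assertions and sinon stubs. Assuming a standard mocha setup, it should run with something like: npx mocha path/to/this/test.js. The exact test path is not shown here, though MODULE_PATH above implies the file lives three directory levels below the service root (e.g. a test/unit/js/ layout).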