/**
 * ContentCacheManager - maintains a cache of stream hashes from a PDF file
 */

const { callbackify } = require('util')
const fs = require('fs')
const crypto = require('crypto')
const Path = require('path')
const Settings = require('settings-sharelatex')
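
// Minimum size (in bytes) for a stream to be worth caching; update() below
// skips smaller streams. The actual threshold is deployment configuration.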
const MIN_CHUNK_SIZE = Settings.pdfCachingMinChunkSize

// PDF object streams are delimited by the `stream` and `endstream` keywords.
const START_OF_STREAM_MARKER = 'stream'
const END_OF_STREAM_MARKER = 'endstream'
const START_OF_STREAM_MARKER_LENGTH = START_OF_STREAM_MARKER.length
const END_OF_STREAM_MARKER_LENGTH = END_OF_STREAM_MARKER.length

/**
 * Scan a PDF file for object streams, hash each one, and persist newly seen
 * streams in the content directory.
 *
 * @param {String} contentDir path to directory where content hash files are cached
 * @param {String} filePath the pdf file to scan for streams
 * @return {Promise<Array>} a [ranges, newRanges] pair: all qualifying stream
 *   ranges found in the file, and the subset newly written to disk
 */
async function update(contentDir, filePath) {
  const stream = fs.createReadStream(filePath)
  const extractor = new PdfStreamsExtractor()
  const ranges = []
  const newRanges = []
  const seenHashes = new Set()
  for await (const chunk of stream) {
    const pdfStreams = extractor.consume(chunk)
    for (const pdfStream of pdfStreams) {
      // Ignore streams below the caching threshold.
      if (pdfStream.end - pdfStream.start < MIN_CHUNK_SIZE) continue
      const hash = pdfStreamHash(pdfStream.buffers)
      const range = { start: pdfStream.start, end: pdfStream.end, hash }
      ranges.push(range)

      // Optimization: Skip writing of duplicate streams.
      if (seenHashes.has(hash)) continue
      seenHashes.add(hash)

      if (await writePdfStream(contentDir, hash, pdfStream.buffers)) {
        newRanges.push(range)
      }
    }
  }
  return [ranges, newRanges]
}
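
/**
 * Incremental scanner for `stream ... endstream` sections of a PDF file.
 *
 * Call consume() with consecutive chunks of the file; it returns the streams
 * completed within each chunk and carries partial state (open stream content,
 * trailing bytes that may hold a split marker) over to the next call.
 */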
class PdfStreamsExtractor {
  constructor() {
    this.fileIndex = 0
    this.inStream = false
    this.streamStartIndex = 0
    this.buffers = []
    this.lastChunk = Buffer.alloc(0)
  }

  consume(chunk) {
    let chunkIndex = 0
    const pdfStreams = []
    // Prepend the bytes retained from the previous chunk, so that markers
    // split across chunk boundaries are still matched.
    chunk = Buffer.concat([this.lastChunk, chunk])
    while (true) {
      if (!this.inStream) {
        // Not in a stream, look for the stream start marker
        const index = chunk.indexOf(START_OF_STREAM_MARKER, chunkIndex)
        if (index === -1) {
          // Couldn't find a stream start in this chunk
          break
        }
        // Found a stream start, start a stream
        this.inStream = true
        this.streamStartIndex = this.fileIndex + index
        chunkIndex = index
      } else {
        // In a stream, look for the stream end marker
        const index = chunk.indexOf(END_OF_STREAM_MARKER, chunkIndex)
        if (index === -1) {
          break
        }
        // Include the trailing "endstream" marker in the captured stream
        const endIndex = index + END_OF_STREAM_MARKER_LENGTH
        this.buffers.push(chunk.slice(chunkIndex, endIndex))
        pdfStreams.push({
          start: this.streamStartIndex,
          end: this.fileIndex + endIndex,
          buffers: this.buffers
        })
        this.inStream = false
        this.buffers = []
        chunkIndex = endIndex
      }
    }
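
    // Retain the tail of this chunk: a marker could be split across the
    // chunk boundary, so keep at least one marker-length of bytes to re-scan
    // together with the next chunk.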
    const remaining = chunk.length - chunkIndex
    const nextMarkerLength = this.inStream
      ? END_OF_STREAM_MARKER_LENGTH
      : START_OF_STREAM_MARKER_LENGTH
    if (remaining > nextMarkerLength) {
      const retainMarkerSection = chunk.length - nextMarkerLength
      if (this.inStream) {
        this.buffers.push(chunk.slice(chunkIndex, retainMarkerSection))
      }
      this.lastChunk = chunk.slice(retainMarkerSection)
      this.fileIndex += retainMarkerSection
    } else {
      this.lastChunk = chunk.slice(chunkIndex)
      this.fileIndex += chunkIndex
    }
    return pdfStreams
  }
}
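
/**
 * Compute the SHA-256 hex digest over a stream's content buffers.
 */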
function pdfStreamHash(buffers) {
  const hash = crypto.createHash('sha256')
  for (const buffer of buffers) {
    hash.update(buffer)
  }
  return hash.digest('hex')
}
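
/**
 * Write the stream content into the content directory, named after its hash.
 *
 * Writes to a temporary file first and renames it into place, so that
 * concurrent readers never see a partially written cache entry.
 *
 * @return {Promise<Boolean>} false if the file already existed (and was left
 *   untouched), true once the content has been written
 */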
async function writePdfStream(dir, hash, buffers) {
  const filename = Path.join(dir, hash)
  try {
    await fs.promises.stat(filename)
    // The file exists. Do not rewrite the content.
    // It would change the modified-time of the file and hence invalidate the
    // ETags used for client side caching via browser internals.
    return false
  } catch (e) {}
  const atomicWriteFilename = filename + '~'
  const file = await fs.promises.open(atomicWriteFilename, 'w')
  if (Settings.enablePdfCachingDark) {
    // Write an empty file in dark mode.
    buffers = []
  }
  try {
    try {
      for (const buffer of buffers) {
        await file.write(buffer)
      }
    } finally {
      await file.close()
    }
    await fs.promises.rename(atomicWriteFilename, filename)
  } catch (err) {
    // Best-effort cleanup of the temporary file, then surface the write error.
    try {
      await fs.promises.unlink(atomicWriteFilename)
    } catch (_) {}
    throw err
  }
  return true
}

module.exports = {
  HASH_REGEX: /^[0-9a-f]{64}$/,
  update: callbackify(update)
}
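
// Example usage (illustrative only; the paths are hypothetical):
//
//   const ContentCacheManager = require('./ContentCacheManager')
//   ContentCacheManager.update('/cache/contents', '/compiles/output.pdf',
//     (err, result) => {
//       if (err) throw err
//       const [ranges, newRanges] = result
//       console.log(`found ${ranges.length} streams, wrote ${newRanges.length}`)
//     }
//   )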