overleaf/services/clsi/app/js/ContentCacheManager.js

122 lines
3.3 KiB
JavaScript
Raw Normal View History

/**
* ContentCacheManager - maintains a cache of stream hashes from a PDF file
*/
const { callbackify } = require('util')
const fs = require('fs')
const crypto = require('crypto')
const Path = require('path')
const Settings = require('settings-sharelatex')
// Minimum size (stream end offset minus stream start offset) a PDF stream must
// have for update() to hash and cache it; smaller streams are skipped.
// The value is read from the service settings.
const MIN_CHUNK_SIZE = Settings.pdfCachingMinChunkSize
/**
 * Scan a PDF file for streams and store every stream that is not smaller than
 * MIN_CHUNK_SIZE in the content directory, keyed by the hash of its content.
 *
 * @param {String} contentDir path to directory where content hash files are cached
 * @param {String} filePath the pdf file to scan for streams
 * @returns {Promise<Array>} a pair `[ranges, newRanges]`: `ranges` holds one
 *   `{ start, end, hash }` entry per cached stream found in the file, and
 *   `newRanges` holds the subset whose content file was written by this call.
 */
async function update(contentDir, filePath) {
  const pdfFile = fs.createReadStream(filePath)
  const streamsExtractor = new PdfStreamsExtractor()
  const allRanges = []
  const freshRanges = []
  for await (const data of pdfFile) {
    for (const { start, end, buffers } of streamsExtractor.consume(data)) {
      const size = end - start
      // Too small to be worth caching.
      if (size < MIN_CHUNK_SIZE) continue

      const hash = pdfStreamHash(buffers)
      const range = { start, end, hash }
      allRanges.push(range)

      // writePdfStream reports whether it actually created the content file.
      const wasWritten = await writePdfStream(contentDir, hash, buffers)
      if (wasWritten) {
        freshRanges.push(range)
      }
    }
  }
  return [allRanges, freshRanges]
}
/**
 * Incrementally locates PDF streams in a file that is fed in chunk by chunk.
 *
 * A stream is the byte range that begins at a `stream` keyword and ends right
 * after the next `endstream` keyword. The keywords are found with a plain byte
 * search; the PDF structure is not parsed.
 *
 * A keyword may straddle two chunks. To still find it, the last few bytes of
 * every chunk that could be the beginning of a keyword are held back and
 * re-scanned together with the next chunk.
 */
class PdfStreamsExtractor {
  constructor() {
    // Number of bytes of the file passed to consume() so far.
    this.fileIndex = 0
    // Whether the scan position is currently inside a stream.
    this.inStream = false
    // Offset in the file at which the current stream starts.
    this.streamStartIndex = 0
    // Content of the current stream collected so far.
    this.buffers = []
    // Held-back tail of the previous chunk(s), not yet assigned to a stream.
    this.carry = Buffer.alloc(0)
  }

  /**
   * Scan the next chunk of the file.
   *
   * @param {Buffer|String} chunk the bytes following the previous chunk
   * @returns {Array<{start: number, end: number, buffers: Buffer[]}>} the
   *   streams completed by this chunk; `start`/`end` are offsets in the file
   *   (`end` is exclusive) and the concatenated `buffers` hold exactly the
   *   bytes in that range, including both keywords.
   */
  consume(chunk) {
    const START_MARKER = 'stream'
    const END_MARKER = 'endstream'

    const incoming = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk)
    // Scan the held-back tail together with the new bytes so that a keyword
    // split across the chunk boundary is seen in one piece.
    const data =
      this.carry.length > 0 ? Buffer.concat([this.carry, incoming]) : incoming
    // Offset in the file of data[0].
    const dataOffset = this.fileIndex - this.carry.length

    let dataIndex = 0
    const pdfStreams = []
    while (true) {
      if (!this.inStream) {
        // Not in a stream, look for stream start
        const index = data.indexOf(START_MARKER, dataIndex)
        if (index === -1) {
          // Couldn't find stream start
          break
        }
        // Found stream start, start a stream
        this.inStream = true
        this.streamStartIndex = dataOffset + index
        dataIndex = index
      } else {
        // In a stream, look for stream end
        const index = data.indexOf(END_MARKER, dataIndex)
        if (index === -1) {
          break
        }
        // add "endstream" part
        const endIndex = index + END_MARKER.length
        this.buffers.push(data.slice(dataIndex, endIndex))
        pdfStreams.push({
          start: this.streamStartIndex,
          end: dataOffset + endIndex,
          buffers: this.buffers
        })
        this.inStream = false
        this.buffers = []
        dataIndex = endIndex
      }
    }

    // Hold back the bytes that may be the first part of the keyword we are
    // currently looking for: at most (keyword length - 1) bytes, and never
    // bytes that were already consumed above.
    const wantedMarker = this.inStream ? END_MARKER : START_MARKER
    const carryStart = Math.max(
      dataIndex,
      data.length - (wantedMarker.length - 1)
    )
    if (this.inStream && carryStart > dataIndex) {
      // Everything before the held-back tail is definitely stream content.
      this.buffers.push(data.slice(dataIndex, carryStart))
    }
    this.carry = data.slice(carryStart)
    this.fileIndex += incoming.length
    return pdfStreams
  }
}
/**
 * Compute the SHA-256 digest of a stream whose content is split over several
 * buffers. The buffers are hashed in order, as if they were concatenated.
 *
 * @param {Iterable<Buffer|String>} buffers the pieces of the stream content
 * @returns {String} the digest as a lowercase hex string
 */
function pdfStreamHash(buffers) {
  const sha256 = crypto.createHash('sha256')
  for (const piece of buffers) {
    sha256.update(piece)
  }
  const hexDigest = sha256.digest('hex')
  return hexDigest
}
/**
 * Store the content of a PDF stream in `dir`, using its hash as the file name.
 *
 * An existing file is never touched: rewriting it would change the
 * modified-time of the file and hence invalidate the ETags used for client
 * side caching via browser internals.
 *
 * When `Settings.enablePdfCachingDark` is set, the file is created but left
 * empty.
 *
 * @param {String} dir directory that holds the content hash files
 * @param {String} hash hash of the stream content, used as file name
 * @param {Array<Buffer>} buffers the pieces of the stream content, in order
 * @returns {Promise<Boolean>} true when the file was created by this call,
 *   false when a file with that name already existed
 * @throws any file system error other than "file already exists"
 */
async function writePdfStream(dir, hash, buffers) {
  const filename = Path.join(dir, hash)
  let file
  try {
    // 'wx' creates the file and fails with EEXIST when it is already there.
    // Doing the existence check and the creation in one step means a file
    // created concurrently can never be truncated by this call.
    file = await fs.promises.open(filename, 'wx')
  } catch (err) {
    if (err.code === 'EEXIST') {
      // The file exists. Do not rewrite the content.
      return false
    }
    throw err
  }
  // Write an empty file in dark mode.
  const contents = Settings.enablePdfCachingDark ? [] : buffers
  try {
    for (const buffer of contents) {
      await file.write(buffer)
    }
  } finally {
    await file.close()
  }
  return true
}
module.exports = {
HASH_REGEX: /^[0-9a-f]{64}$/,
update: callbackify(update)
}