// overleaf/services/clsi/app/js/ContentCacheManager.js

/**
 * ContentCacheManager - maintains a cache of stream hashes from a PDF file
 */

const { callbackify } = require('util')
const fs = require('fs')
const crypto = require('crypto')
const Path = require('path')
const Settings = require('settings-sharelatex')

// Size threshold read from the service settings: update() skips any PDF stream
// whose span (from the "stream" keyword through the end of "endstream") is
// smaller than this value, so such streams are neither hashed nor cached.
const MIN_CHUNK_SIZE = Settings.pdfCachingMinChunkSize

/**
 * Scan a PDF file for stream objects and store each sufficiently large one
 * in the content cache, keyed by the hash of its bytes.
 *
 * @param {String} contentDir path to directory where content hash files are cached
 * @param {String} filePath the pdf file to scan for streams
 * @returns {Promise<Array>} a two-element array `[ranges, newRanges]`: every
 *   `{ start, end, hash }` range that met the size threshold, and the subset
 *   of those ranges whose cache file was created by this call
 */
async function update(contentDir, filePath) {
  const pdfFile = fs.createReadStream(filePath)
  const scanner = new PdfStreamsExtractor()
  const allRanges = []
  const createdRanges = []

  for await (const chunk of pdfFile) {
    for (const { start, end, buffers } of scanner.consume(chunk)) {
      // Streams below the configured threshold are not cached.
      if (end - start < MIN_CHUNK_SIZE) {
        continue
      }
      const entry = { start, end, hash: pdfStreamHash(buffers) }
      allRanges.push(entry)
      // writePdfStream resolves to true only when it wrote a new cache file.
      const wasCreated = await writePdfStream(contentDir, entry.hash, buffers)
      if (wasCreated) {
        createdRanges.push(entry)
      }
    }
  }

  return [allRanges, createdRanges]
}

// Keywords that open and close a stream in the scanned file. Kept as Buffers
// so they are not re-encoded on every search.
const START_OF_STREAM_MARKER = Buffer.from('stream')
const END_OF_STREAM_MARKER = Buffer.from('endstream')

/**
 * Incremental scanner that finds "stream" ... "endstream" sections in a file
 * that is fed to it chunk by chunk.
 *
 * A marker may be split across two chunks. To still detect it, the trailing
 * bytes of each chunk that could be the beginning of the next expected marker
 * are held back and prepended to the following chunk.
 */
class PdfStreamsExtractor {
  constructor() {
    // Absolute file offset of the first byte that has not been fully
    // processed yet, i.e. the offset of `pending[0]`.
    this.fileIndex = 0
    // Whether a "stream" marker was seen without its closing "endstream".
    this.inStream = false
    // Absolute file offset of the "stream" marker of the current stream.
    this.streamStartIndex = 0
    // Pieces of the current stream collected so far.
    this.buffers = []
    // Unprocessed tail of the previous chunk(s); always shorter than the
    // marker that is being looked for.
    this.pending = Buffer.alloc(0)
  }

  /**
   * Feed the next chunk of the file.
   *
   * @param {Buffer} chunk the next consecutive piece of the file
   * @returns {Array<{start: number, end: number, buffers: Buffer[]}>} the
   *   streams completed by this chunk; `start` is the file offset of the
   *   "stream" marker, `end` the offset just past "endstream", and `buffers`
   *   the bytes in between (both markers included), possibly in several pieces
   */
  consume(chunk) {
    const incoming = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk)
    const data =
      this.pending.length === 0
        ? incoming
        : Buffer.concat([this.pending, incoming])
    let dataIndex = 0
    const pdfStreams = []
    while (true) {
      if (!this.inStream) {
        // Not in a stream, look for stream start
        const index = data.indexOf(START_OF_STREAM_MARKER, dataIndex)
        if (index === -1) {
          // Couldn't find stream start
          break
        }
        // Found stream start, start a stream
        this.inStream = true
        this.streamStartIndex = this.fileIndex + index
        dataIndex = index
      } else {
        // In a stream, look for stream end
        const index = data.indexOf(END_OF_STREAM_MARKER, dataIndex)
        if (index === -1) {
          break
        }
        // add "endstream" part
        const endIndex = index + END_OF_STREAM_MARKER.length
        this.buffers.push(data.subarray(dataIndex, endIndex))
        pdfStreams.push({
          start: this.streamStartIndex,
          end: this.fileIndex + endIndex,
          buffers: this.buffers
        })
        this.inStream = false
        this.buffers = []
        dataIndex = endIndex
      }
    }

    // The last (marker length - 1) bytes may be the first part of a marker
    // that continues in the next chunk, so hold them back. Never go back
    // before `dataIndex`: those bytes were already consumed, and re-scanning
    // them could match the "stream" inside a just-processed "endstream".
    const nextMarker = this.inStream
      ? END_OF_STREAM_MARKER
      : START_OF_STREAM_MARKER
    const retainFrom = Math.max(
      dataIndex,
      data.length - (nextMarker.length - 1)
    )
    if (this.inStream && retainFrom > dataIndex) {
      // Everything before the held-back tail definitely belongs to the stream.
      this.buffers.push(data.subarray(dataIndex, retainFrom))
    }
    this.pending = data.subarray(retainFrom)
    this.fileIndex += retainFrom
    return pdfStreams
  }
}

/**
 * Compute the SHA-256 digest over the concatenation of the given pieces,
 * without actually concatenating them.
 *
 * @param {Array<Buffer>} buffers the pieces of one stream, in order
 * @returns {String} the digest as a lowercase hex string
 */
function pdfStreamHash(buffers) {
  // Hash#update returns the hash object itself, so it can act as accumulator.
  return buffers
    .reduce((digest, piece) => digest.update(piece), crypto.createHash('sha256'))
    .digest('hex')
}

/**
 * Store the content of one PDF stream in the cache directory, unless a file
 * for that hash is already there.
 *
 * @param {String} dir path to the cache directory
 * @param {String} hash hash of the stream content, used as the file name
 * @param {Array<Buffer>} buffers the pieces of the stream content, in order
 * @returns {Promise<Boolean>} true when a new file was created, false when
 *   the file already existed and was left untouched
 * @throws when the file cannot be created or written; a partially written
 *   file is removed before the error is propagated
 */
async function writePdfStream(dir, hash, buffers) {
  const filename = Path.join(dir, hash)
  let file
  try {
    // 'wx' creates the file and fails with EEXIST when it is already there.
    // Checking and creating in one step leaves no window between a separate
    // existence check and the open.
    file = await fs.promises.open(filename, 'wx')
  } catch (err) {
    if (err.code === 'EEXIST') {
      // The file exists. Do not rewrite the content.
      // It would change the modified-time of the file and hence invalidate the
      //  ETags used for client side caching via browser internals.
      return false
    }
    throw err
  }
  // Write an empty file in dark mode.
  const content = Settings.enablePdfCachingDark ? [] : buffers
  try {
    for (const buffer of content) {
      await file.write(buffer)
    }
  } catch (err) {
    // A truncated file would be mistaken for a valid cache entry by later
    // runs, so remove it. Cleanup is best effort: the write error is the one
    // worth reporting.
    await file.close().catch(() => {})
    await fs.promises.unlink(filename).catch(() => {})
    throw err
  }
  await file.close()
  return true
}

module.exports = {
  HASH_REGEX: /^[0-9a-f]{64}$/,
  update: callbackify(update)
}