overleaf/services/clsi/app/js/ContentCacheManager.js

/**
 * ContentCacheManager - maintains a cache of stream hashes from a PDF file
 */

const { callbackify } = require('util')
const fs = require('fs')
const crypto = require('crypto')
const Path = require('path')
const Settings = require('settings-sharelatex')

const MIN_CHUNK_SIZE = Settings.pdfCachingMinChunkSize

/**
 * Scan a PDF file for streams and record each sufficiently large one in the
 * content cache directory, keyed by the hash of its bytes.
 *
 * @param {String} contentDir path to directory where content hash files are cached
 * @param {String} filePath the pdf file to scan for streams
 * @returns {Promise<Array>} a pair `[ranges, newRanges]`: `ranges` lists
 *   `{ start, end, hash }` for every stream that was not skipped, `newRanges`
 *   lists the subset for which `writePdfStream` returned a truthy value
 */
async function update(contentDir, filePath) {
  const input = fs.createReadStream(filePath)
  const scanner = new PdfStreamsExtractor()
  const allRanges = []
  const freshRanges = []
  for await (const chunk of input) {
    for (const { start, end, buffers } of scanner.consume(chunk)) {
      // Streams shorter than the configured minimum are not recorded at all.
      const tooSmall = end - start < MIN_CHUNK_SIZE
      if (tooSmall) {
        continue
      }
      const hash = pdfStreamHash(buffers)
      const entry = { start, end, hash }
      allRanges.push(entry)
      const wasWritten = await writePdfStream(contentDir, hash, buffers)
      if (wasWritten) {
        freshRanges.push(entry)
      }
    }
  }
  return [allRanges, freshRanges]
}

/**
 * Incremental scanner that finds the byte ranges delimited by the `stream`
 * and `endstream` keywords in data that is fed to it chunk by chunk.
 *
 * A keyword may be split across two consecutive chunks. To still detect it,
 * the scanner keeps the last few unscanned bytes of every chunk and prepends
 * them to the next one.
 */
class PdfStreamsExtractor {
  constructor() {
    // Absolute file offset of the first byte that has not been fully
    // processed yet, i.e. the offset of the first byte of `lastChunk`.
    this.fileIndex = 0
    // Whether a `stream` keyword has been seen without its `endstream`.
    this.inStream = false
    // Absolute file offset of the `stream` keyword of the current stream.
    this.streamStartIndex = 0
    // Pieces of the current stream collected so far.
    this.buffers = []
    // Unprocessed tail of the previous chunk (shorter than the next keyword).
    this.lastChunk = Buffer.alloc(0)
  }

  /**
   * Feed the next chunk of the file to the scanner.
   *
   * @param {Buffer|String|Uint8Array} chunk the next piece of the file
   * @returns {Array<{start: number, end: number, buffers: Buffer[]}>} the
   *   streams whose `endstream` keyword was completed by this chunk. `start`
   *   is the file offset of the `stream` keyword, `end` is the offset just
   *   past the `endstream` keyword, and `buffers` holds the bytes in between
   *   (both keywords included).
   */
  consume(chunk) {
    const START_MARKER = 'stream'
    const END_MARKER = 'endstream'
    const pdfStreams = []

    const incoming = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk)
    const data =
      this.lastChunk.length > 0
        ? Buffer.concat([this.lastChunk, incoming])
        : incoming

    let dataIndex = 0
    while (true) {
      if (!this.inStream) {
        // Not in a stream, look for stream start
        const index = data.indexOf(START_MARKER, dataIndex)
        if (index === -1) {
          break
        }
        this.inStream = true
        this.streamStartIndex = this.fileIndex + index
        dataIndex = index
      } else {
        // In a stream, look for stream end
        const index = data.indexOf(END_MARKER, dataIndex)
        if (index === -1) {
          break
        }
        // Include the "endstream" keyword in the stream content.
        const endIndex = index + END_MARKER.length
        this.buffers.push(data.subarray(dataIndex, endIndex))
        pdfStreams.push({
          start: this.streamStartIndex,
          end: this.fileIndex + endIndex,
          buffers: this.buffers
        })
        this.inStream = false
        this.buffers = []
        dataIndex = endIndex
      }
    }

    // The keyword we are waiting for may start in the last bytes of this
    // chunk and finish in the next one. Hold back (keyword length - 1) bytes,
    // but never bytes that were already consumed (before dataIndex).
    const nextMarker = this.inStream ? END_MARKER : START_MARKER
    const retainFrom = Math.max(
      dataIndex,
      data.length - (nextMarker.length - 1)
    )
    if (this.inStream && retainFrom > dataIndex) {
      // Everything before the held-back tail is definitely stream content.
      this.buffers.push(data.subarray(dataIndex, retainFrom))
    }
    this.lastChunk = data.subarray(retainFrom)
    this.fileIndex += retainFrom
    return pdfStreams
  }
}

/**
 * Compute the SHA-256 digest of the concatenation of the given buffers.
 *
 * @param {Iterable<Buffer>} buffers the pieces to hash, in order
 * @returns {String} the digest as a hex string
 */
function pdfStreamHash(buffers) {
  return [...buffers]
    .reduce((digester, part) => digester.update(part), crypto.createHash('sha256'))
    .digest('hex')
}

/**
 * Store the content of a PDF stream in `dir` under the name `hash`, unless a
 * file with that name is already there.
 *
 * @param {String} dir directory holding the cached content files
 * @param {String} hash file name to use for the content
 * @param {Iterable<Buffer>} buffers the pieces of the content, in order
 * @returns {Promise<boolean>} true when a new file was created, false when
 *   the file already existed and was left untouched
 * @throws any error from creating or writing the file other than EEXIST
 */
async function writePdfStream(dir, hash, buffers) {
  const filename = Path.join(dir, hash)
  let file
  try {
    // 'wx' creates the file only if it does not exist yet, in one atomic
    // step. An existing file must not be rewritten: it would change the
    // modified-time of the file and hence invalidate the ETags used for
    // client side caching via browser internals.
    file = await fs.promises.open(filename, 'wx')
  } catch (err) {
    if (err.code === 'EEXIST') {
      return false
    }
    throw err
  }
  // Write an empty file in dark mode.
  const contents = Settings.enablePdfCachingDark ? [] : buffers
  try {
    for (const buffer of contents) {
      await file.write(buffer)
    }
  } catch (err) {
    // Do not leave a truncated file behind: later calls would find it and
    // treat it as valid cached content. Cleanup is best effort; the write
    // error is the one that gets reported.
    await file.close().catch(() => {})
    await fs.promises.unlink(filename).catch(() => {})
    throw err
  }
  await file.close()
  return true
}

module.exports = {
  HASH_REGEX: /^[0-9a-f]{64}$/,
  update: callbackify(update)
}
[misc] merge pdf caching into main (#226) * wip generate directory for hash content * cleanup, remove console logging * add content caching module * Return PDF stream ranges with compile response * Return the PDF file size in the compile response * PDF range endpoint * [misc] WIP: pdf caching: preserve the m-time on static content files * [misc] WIP: pdf caching: improve browser caching, emit caching headers * [misc] WIP: pdf caching: do not emit very small chunks <1kB * [misc] keep up with moving output files into a separate directory * [OutputCacheManager] add global feature flag for enabling pdf caching * [misc] add contentId into the URL for protecting PDF stream contents * [misc] support PDF stream caching for anonymous users * [misc] add per-request feature flag for enabling PDF stream caching * [misc] enable pdf caching in CI and emit metrics at the end of run * [misc] expose compile stats and timings to the frontend * [misc] log an error in case saving output files fails * [misc] add metrics for pdf bandwidth and pdf caching performance * [misc] add a dark mode to the pdf caching for computing ranges only * [misc] move pdf caching metrics into ContentCacheMetrics * [misc] add a config option for the min chunk size of pdf ranges Co-authored-by: Brian Gough <brian.gough@overleaf.com> Co-authored-by: Eric Mc Sween <eric.mcsween@overleaf.com> 2021-05-13 13:07:54 +00:00			`/**`
			`* ContentCacheManager - maintains a cache of stream hashes from a PDF file`
			`*/`

			`const { callbackify } = require('util')`
			`const fs = require('fs')`
			`const crypto = require('crypto')`
			`const Path = require('path')`
			`const Settings = require('settings-sharelatex')`

			`const MIN_CHUNK_SIZE = Settings.pdfCachingMinChunkSize`

			`/**`
			`*`
			`* @param {String} contentDir path to directory where content hash files are cached`
			`* @param {String} filePath the pdf file to scan for streams`
			`*/`
			`async function update(contentDir, filePath) {`
			`const stream = fs.createReadStream(filePath)`
			`const extractor = new PdfStreamsExtractor()`
			`const ranges = []`
			`const newRanges = []`
			`for await (const chunk of stream) {`
			`const pdfStreams = extractor.consume(chunk)`
			`for (const pdfStream of pdfStreams) {`
			`if (pdfStream.end - pdfStream.start < MIN_CHUNK_SIZE) continue`
			`const hash = pdfStreamHash(pdfStream.buffers)`
			`const range = { start: pdfStream.start, end: pdfStream.end, hash }`
			`ranges.push(range)`
			`if (await writePdfStream(contentDir, hash, pdfStream.buffers)) {`
			`newRanges.push(range)`
			`}`
			`}`
			`}`
			`return [ranges, newRanges]`
			`}`

			`class PdfStreamsExtractor {`
			`constructor() {`
			`this.fileIndex = 0`
			`this.inStream = false`
			`this.streamStartIndex = 0`
			`this.buffers = []`
			`}`

			`consume(chunk) {`
			`let chunkIndex = 0`
			`const pdfStreams = []`
			`while (true) {`
			`if (!this.inStream) {`
			`// Not in a stream, look for stream start`
			`const index = chunk.indexOf('stream', chunkIndex)`
			`if (index === -1) {`
			`// Couldn't find stream start`
			`break`
			`}`
			`// Found stream start, start a stream`
			`this.inStream = true`
			`this.streamStartIndex = this.fileIndex + index`
			`chunkIndex = index`
			`} else {`
			`// In a stream, look for stream end`
			`const index = chunk.indexOf('endstream', chunkIndex)`
			`if (index === -1) {`
			`this.buffers.push(chunk.slice(chunkIndex))`
			`break`
			`}`
			`// add "endstream" part`
			`const endIndex = index + 9`
			`this.buffers.push(chunk.slice(chunkIndex, endIndex))`
			`pdfStreams.push({`
			`start: this.streamStartIndex,`
			`end: this.fileIndex + endIndex,`
			`buffers: this.buffers`
			`})`
			`this.inStream = false`
			`this.buffers = []`
			`chunkIndex = endIndex`
			`}`
			`}`
			`this.fileIndex += chunk.length`
			`return pdfStreams`
			`}`
			`}`

			`function pdfStreamHash(buffers) {`
			`const hash = crypto.createHash('sha256')`
			`for (const buffer of buffers) {`
			`hash.update(buffer)`
			`}`
			`return hash.digest('hex')`
			`}`

			`async function writePdfStream(dir, hash, buffers) {`
			`const filename = Path.join(dir, hash)`
			`try {`
			`await fs.promises.stat(filename)`
			`// The file exists. Do not rewrite the content.`
			`// It would change the modified-time of the file and hence invalidate the`
			`// ETags used for client side caching via browser internals.`
			`return false`
			`} catch (e) {}`
			`const file = await fs.promises.open(filename, 'w')`
			`if (Settings.enablePdfCachingDark) {`
			`// Write an empty file in dark mode.`
			`buffers = []`
			`}`
			`try {`
			`for (const buffer of buffers) {`
			`await file.write(buffer)`
			`}`
			`} finally {`
			`await file.close()`
			`}`
			`return true`
			`}`

add validation for express :content_id parameter 2021-05-13 13:56:15 +00:00			`module.exports = {`
			`HASH_REGEX: /^[0-9a-f]{64}$/,`
			`update: callbackify(update)`
			`}`