overleaf/services/clsi/app/lib/pdfjs/FSStream.js

const { Stream } = require('pdfjs-dist/lib/core/stream')
const { MissingDataException } = require('pdfjs-dist/lib/core/core_utils')

/**
 * Page size, in bytes, for reads from the file: range requests smaller than
 * this are widened to this many bytes before being read.
 */
const BUF_SIZE = 1024

/**
 * A pdf.js `Stream` whose bytes are read on demand from a file handle rather
 * than held in memory.
 *
 * Reads are kept in `cachedBytes`, a list of `{ begin, end, buffer }` chunks
 * where `begin`/`end` are absolute file offsets (`end` exclusive). Accessors
 * throw a `MissingDataException` when the bytes they need are not cached yet;
 * the caller is expected to call `requestRange()` for that range and retry.
 * Sub-streams created with `makeSubStream()` share the same cache.
 */
class FSStream extends Stream {
  /**
   * @param {object} fh - file handle exposing
   *   `read(buffer, offset, length, position)`
   * @param {number} start - absolute offset of the first byte of the stream
   * @param {number} length - number of bytes in the stream
   * @param {object} dict - dictionary passed through to the base `Stream`
   * @param {Array} cachedBytes - chunk cache to share; a fresh one is created
   *   when omitted
   * @param {Function} checkDeadline - called with a description string before
   *   potentially expensive operations (presumably throws once the processing
   *   deadline has passed -- confirm against the caller)
   */
  constructor(fh, start, length, dict, cachedBytes, checkDeadline) {
    // The base class wants a buffer; hand it a tiny dummy one and drop it
    // again, all real data comes from `cachedBytes`.
    const nonEmptyDummyBuffer = Buffer.alloc(1, 0)
    super(nonEmptyDummyBuffer, start, length, dict)
    delete this.bytes
    this.fh = fh
    this.checkDeadline = checkDeadline
    this.cachedBytes = cachedBytes || []
  }

  /** Size of the stream in bytes (NOT an absolute offset). */
  get length() {
    return this.end - this.start
  }

  /** True when the stream spans zero bytes. */
  get isEmpty() {
    return this.length === 0
  }

  // Manage cached reads from the file

  /**
   * Read the absolute byte range [begin, end) from the file into the cache.
   * Small ranges are widened to BUF_SIZE bytes; the range never extends past
   * the end of this stream.
   *
   * @param {number} begin - absolute offset of the first byte to read
   * @param {number} end - absolute offset one past the last byte to read
   * @returns {*} whatever `fh.read()` returns (presumably a promise)
   */
  requestRange(begin, end) {
    this.checkDeadline(`request range ${begin} - ${end}`)
    // expand small ranges to read a larger amount
    if (end - begin < BUF_SIZE) {
      end = begin + BUF_SIZE
    }
    // `end` is an absolute offset, so clamp against the absolute end of the
    // stream; `this.length` is a size and is only equal for `start === 0`.
    end = Math.min(end, this.end)
    // keep a cache of previous reads with {begin,end,buffer} values
    const result = {
      begin,
      end,
      buffer: Buffer.alloc(end - begin, 0),
    }
    this.cachedBytes.push(result)
    return this.fh.read(result.buffer, 0, end - begin, begin)
  }

  /**
   * Find the cached chunk containing the absolute offset `pos`.
   *
   * @throws {MissingDataException} when no cached chunk contains `pos`
   */
  _ensureGetPos(pos) {
    const found = this.cachedBytes.find(x => {
      return x.begin <= pos && pos < x.end
    })
    if (!found) {
      throw new MissingDataException(pos, pos + 1)
    }
    return found
  }

  /**
   * Find a single cached chunk covering the absolute range [begin, end).
   * `end` is clamped to the end of the stream first.
   *
   * @throws {MissingDataException} when no single cached chunk covers it
   */
  _ensureGetRange(begin, end) {
    // BG: handle overflow case. Clamp against the absolute end offset, not
    // the stream size, otherwise sub-streams accept chunks that are too short.
    end = Math.min(end, this.end)
    const found = this.cachedBytes.find(x => {
      return x.begin <= begin && end <= x.end
    })
    if (!found) {
      throw new MissingDataException(begin, end)
    }
    return found
  }

  /** Read the byte at absolute offset `pos` out of the chunk `found`. */
  _readByte(found, pos) {
    return found.buffer[pos - found.begin]
  }

  /** View of the absolute range [pos, end) inside the chunk `found`. */
  _readBytes(found, pos, end) {
    return found.buffer.subarray(pos - found.begin, end - found.begin)
  }

  // handle accesses to the bytes

  /**
   * Check that the byte at absolute offset `pos` is cached.
   *
   * @throws {MissingDataException} when it is not
   */
  ensureByte(pos) {
    this._ensureGetPos(pos) // may throw a MissingDataException
  }

  /**
   * Read the byte at the current position and advance by one.
   *
   * @returns {number} the byte value, or -1 at the end of the stream
   * @throws {MissingDataException} when the byte is not cached
   */
  getByte() {
    const pos = this.pos
    if (pos >= this.end) {
      return -1
    }
    const found = this._ensureGetPos(pos)
    this.pos = pos + 1
    return this._readByte(found, pos)
  }

  // BG: for a range, end is not included (see Buffer.subarray for example)

  /**
   * Check that `length` bytes starting at the current position are cached.
   * `forceClamped` is accepted for signature parity with `getBytes` only.
   *
   * @throws {MissingDataException} when they are not
   */
  ensureBytes(length, forceClamped = false) {
    const pos = this.pos
    this._ensureGetRange(pos, pos + length)
  }

  /**
   * Read up to `length` bytes from the current position and advance past
   * them. With a falsy `length` (omitted or 0) all remaining bytes are
   * returned and the position is left unchanged.
   *
   * @param {number} [length] - number of bytes wanted
   * @param {boolean} [forceClamped] - return a `Uint8ClampedArray` copy
   *   instead of a view on the cached buffer
   * @returns {Uint8Array|Uint8ClampedArray}
   * @throws {MissingDataException} when the range is not covered by a single
   *   cached chunk
   */
  getBytes(length, forceClamped = false) {
    const pos = this.pos
    const strEnd = this.end

    // Work out the real end of the read *before* checking the cache. A
    // missing length would otherwise turn `pos + length` into NaN, which can
    // never match a chunk, and a zero length would only check `pos` itself.
    const end = length ? Math.min(pos + length, strEnd) : strEnd
    const found = this._ensureGetRange(pos, end)
    if (length) {
      this.pos = end
    }
    const subarray = this._readBytes(found, pos, end)
    // The cached buffer is always a `Uint8Array` (a `Buffer`) here.
    return forceClamped ? new Uint8ClampedArray(subarray) : subarray
  }

  /** Not supported by this stream; always throws. */
  getByteRange() {
    // BG: this isn't needed as far as I can tell
    throw new Error('not implemented')
  }

  /** Move the read position back to the start of the stream. */
  reset() {
    this.pos = this.start
  }

  /** Make the current read position the new start of the stream. */
  moveStart() {
    this.start = this.pos
  }

  /**
   * Create a stream over a sub-range of this one, sharing the file handle,
   * the chunk cache and the deadline check.
   *
   * @param {number} start - absolute offset where the sub-stream begins
   * @param {number} [length] - size of the sub-stream; a falsy value means
   *   "up to the end of this stream"
   * @param {object} [dict] - dictionary for the sub-stream
   * @returns {FSStream}
   */
  makeSubStream(start, length, dict = null) {
    this.checkDeadline(`make sub stream start=${start}/length=${length}`)
    // BG: had to add this check for null length, it is being called with only
    // the start value at one point in the xref decoding. The intent is clear
    // enough
    // - a null length means "to the end of the file" -- not sure how it is
    //   working in the existing pdfjs code without this.
    if (!length) {
      length = this.end - start
    }
    return new FSStream(
      this.fh,
      start,
      length,
      dict,
      this.cachedBytes,
      this.checkDeadline
    )
  }
}

// CommonJS export: the module exposes only the FSStream class.
module.exports = { FSStream }
[ContentCacheManager] use PDF.js Xref table instead of stream detection (#242) * make the content cache manager tests configurable * extend stream content in unit tests * [ContentCacheManagerTests] prepare for full object caching * filesystem stream for pdfjs * working?? * cleaning up * handle overflow * [misc] install pdfjs-dist * [misc] move pdfjs code into app/lib/ and scripts/, also use CamelCase * [misc] abstract the file loading and parsing of xRef tables into helper * [misc] pdfjsTests: add snapshot based tests for the Xref table parser * [misc] FSStream: throw proper error and drop commented code * [misc] FSStream: integrate throwing of MissingDataException into getter * [misc] pdfjs: fix eslint errors * [misc] pdfjs: run format_fix * [misc] pdfjs: allocate very small non empty dummy buffers explicitly * [misc] install @overleaf/o-error * [ContentCacheManager] use PDF.js Xref table instead of stream detection Co-Authored-By: Brian Gough <brian.gough@overleaf.com> * [pdfjs] parseXrefTable: handle empty PDF files gracefully Co-authored-by: Brian Gough <brian.gough@overleaf.com> 2021-05-31 04:20:25 -04:00			`const { Stream } = require('pdfjs-dist/lib/core/stream')`
			`const { MissingDataException } = require('pdfjs-dist/lib/core/core_utils')`

			`const BUF_SIZE = 1024 // read from the file in 1024 byte pages`

			`class FSStream extends Stream {`
[misc] bail out from pdf caching processing after 10s or earlier ...for fast compiles. 2021-06-23 09:14:28 -04:00			`constructor(fh, start, length, dict, cachedBytes, checkDeadline) {`
[ContentCacheManager] use PDF.js Xref table instead of stream detection (#242) * make the content cache manager tests configurable * extend stream content in unit tests * [ContentCacheManagerTests] prepare for full object caching * filesystem stream for pdfjs * working?? * cleaning up * handle overflow * [misc] install pdfjs-dist * [misc] move pdfjs code into app/lib/ and scripts/, also use CamelCase * [misc] abstract the file loading and parsing of xRef tables into helper * [misc] pdfjsTests: add snapshot based tests for the Xref table parser * [misc] FSStream: throw proper error and drop commented code * [misc] FSStream: integrate throwing of MissingDataException into getter * [misc] pdfjs: fix eslint errors * [misc] pdfjs: run format_fix * [misc] pdfjs: allocate very small non empty dummy buffers explicitly * [misc] install @overleaf/o-error * [ContentCacheManager] use PDF.js Xref table instead of stream detection Co-Authored-By: Brian Gough <brian.gough@overleaf.com> * [pdfjs] parseXrefTable: handle empty PDF files gracefully Co-authored-by: Brian Gough <brian.gough@overleaf.com> 2021-05-31 04:20:25 -04:00			`const nonEmptyDummyBuffer = Buffer.alloc(1, 0)`
			`super(nonEmptyDummyBuffer, start, length, dict)`
			`delete this.bytes`
			`this.fh = fh`
[misc] bail out from pdf caching processing after 10s or earlier ...for fast compiles. 2021-06-23 09:14:28 -04:00			`this.checkDeadline = checkDeadline`
[ContentCacheManager] use PDF.js Xref table instead of stream detection (#242) * make the content cache manager tests configurable * extend stream content in unit tests * [ContentCacheManagerTests] prepare for full object caching * filesystem stream for pdfjs * working?? * cleaning up * handle overflow * [misc] install pdfjs-dist * [misc] move pdfjs code into app/lib/ and scripts/, also use CamelCase * [misc] abstract the file loading and parsing of xRef tables into helper * [misc] pdfjsTests: add snapshot based tests for the Xref table parser * [misc] FSStream: throw proper error and drop commented code * [misc] FSStream: integrate throwing of MissingDataException into getter * [misc] pdfjs: fix eslint errors * [misc] pdfjs: run format_fix * [misc] pdfjs: allocate very small non empty dummy buffers explicitly * [misc] install @overleaf/o-error * [ContentCacheManager] use PDF.js Xref table instead of stream detection Co-Authored-By: Brian Gough <brian.gough@overleaf.com> * [pdfjs] parseXrefTable: handle empty PDF files gracefully Co-authored-by: Brian Gough <brian.gough@overleaf.com> 2021-05-31 04:20:25 -04:00			`this.cachedBytes = cachedBytes \|\| []`
			`}`

			`get length() {`
			`return this.end - this.start`
			`}`

			`get isEmpty() {`
			`return this.length === 0`
			`}`

			`// Manage cached reads from the file`

			`requestRange(begin, end) {`
[misc] bail out from pdf caching processing after 10s or earlier ...for fast compiles. 2021-06-23 09:14:28 -04:00			this.checkDeadline(`request range ${begin} - ${end}`)
[ContentCacheManager] use PDF.js Xref table instead of stream detection (#242) * make the content cache manager tests configurable * extend stream content in unit tests * [ContentCacheManagerTests] prepare for full object caching * filesystem stream for pdfjs * working?? * cleaning up * handle overflow * [misc] install pdfjs-dist * [misc] move pdfjs code into app/lib/ and scripts/, also use CamelCase * [misc] abstract the file loading and parsing of xRef tables into helper * [misc] pdfjsTests: add snapshot based tests for the Xref table parser * [misc] FSStream: throw proper error and drop commented code * [misc] FSStream: integrate throwing of MissingDataException into getter * [misc] pdfjs: fix eslint errors * [misc] pdfjs: run format_fix * [misc] pdfjs: allocate very small non empty dummy buffers explicitly * [misc] install @overleaf/o-error * [ContentCacheManager] use PDF.js Xref table instead of stream detection Co-Authored-By: Brian Gough <brian.gough@overleaf.com> * [pdfjs] parseXrefTable: handle empty PDF files gracefully Co-authored-by: Brian Gough <brian.gough@overleaf.com> 2021-05-31 04:20:25 -04:00			`// expand small ranges to read a larger amount`
			`if (end - begin < BUF_SIZE) {`
			`end = begin + BUF_SIZE`
			`}`
			`end = Math.min(end, this.length)`
			`// keep a cache of previous reads with {begin,end,buffer} values`
			`const result = {`
			`begin: begin,`
			`end: end,`
[misc] run format_fix and lint:fix 2021-07-13 07:04:48 -04:00			`buffer: Buffer.alloc(end - begin, 0),`
[ContentCacheManager] use PDF.js Xref table instead of stream detection (#242) * make the content cache manager tests configurable * extend stream content in unit tests * [ContentCacheManagerTests] prepare for full object caching * filesystem stream for pdfjs * working?? * cleaning up * handle overflow * [misc] install pdfjs-dist * [misc] move pdfjs code into app/lib/ and scripts/, also use CamelCase * [misc] abstract the file loading and parsing of xRef tables into helper * [misc] pdfjsTests: add snapshot based tests for the Xref table parser * [misc] FSStream: throw proper error and drop commented code * [misc] FSStream: integrate throwing of MissingDataException into getter * [misc] pdfjs: fix eslint errors * [misc] pdfjs: run format_fix * [misc] pdfjs: allocate very small non empty dummy buffers explicitly * [misc] install @overleaf/o-error * [ContentCacheManager] use PDF.js Xref table instead of stream detection Co-Authored-By: Brian Gough <brian.gough@overleaf.com> * [pdfjs] parseXrefTable: handle empty PDF files gracefully Co-authored-by: Brian Gough <brian.gough@overleaf.com> 2021-05-31 04:20:25 -04:00			`}`
			`this.cachedBytes.push(result)`
			`return this.fh.read(result.buffer, 0, end - begin, begin)`
			`}`

			`_ensureGetPos(pos) {`
[misc] run format_fix and lint:fix 2021-07-13 07:04:48 -04:00			`const found = this.cachedBytes.find(x => {`
[ContentCacheManager] use PDF.js Xref table instead of stream detection (#242) * make the content cache manager tests configurable * extend stream content in unit tests * [ContentCacheManagerTests] prepare for full object caching * filesystem stream for pdfjs * working?? * cleaning up * handle overflow * [misc] install pdfjs-dist * [misc] move pdfjs code into app/lib/ and scripts/, also use CamelCase * [misc] abstract the file loading and parsing of xRef tables into helper * [misc] pdfjsTests: add snapshot based tests for the Xref table parser * [misc] FSStream: throw proper error and drop commented code * [misc] FSStream: integrate throwing of MissingDataException into getter * [misc] pdfjs: fix eslint errors * [misc] pdfjs: run format_fix * [misc] pdfjs: allocate very small non empty dummy buffers explicitly * [misc] install @overleaf/o-error * [ContentCacheManager] use PDF.js Xref table instead of stream detection Co-Authored-By: Brian Gough <brian.gough@overleaf.com> * [pdfjs] parseXrefTable: handle empty PDF files gracefully Co-authored-by: Brian Gough <brian.gough@overleaf.com> 2021-05-31 04:20:25 -04:00			`return x.begin <= pos && pos < x.end`
			`})`
			`if (!found) {`
			`throw new MissingDataException(pos, pos + 1)`
			`}`
			`return found`
			`}`

			`_ensureGetRange(begin, end) {`
			`end = Math.min(end, this.length) // BG: handle overflow case`
[misc] run format_fix and lint:fix 2021-07-13 07:04:48 -04:00			`const found = this.cachedBytes.find(x => {`
[ContentCacheManager] use PDF.js Xref table instead of stream detection (#242) * make the content cache manager tests configurable * extend stream content in unit tests * [ContentCacheManagerTests] prepare for full object caching * filesystem stream for pdfjs * working?? * cleaning up * handle overflow * [misc] install pdfjs-dist * [misc] move pdfjs code into app/lib/ and scripts/, also use CamelCase * [misc] abstract the file loading and parsing of xRef tables into helper * [misc] pdfjsTests: add snapshot based tests for the Xref table parser * [misc] FSStream: throw proper error and drop commented code * [misc] FSStream: integrate throwing of MissingDataException into getter * [misc] pdfjs: fix eslint errors * [misc] pdfjs: run format_fix * [misc] pdfjs: allocate very small non empty dummy buffers explicitly * [misc] install @overleaf/o-error * [ContentCacheManager] use PDF.js Xref table instead of stream detection Co-Authored-By: Brian Gough <brian.gough@overleaf.com> * [pdfjs] parseXrefTable: handle empty PDF files gracefully Co-authored-by: Brian Gough <brian.gough@overleaf.com> 2021-05-31 04:20:25 -04:00			`return x.begin <= begin && end <= x.end`
			`})`
			`if (!found) {`
			`throw new MissingDataException(begin, end)`
			`}`
			`return found`
			`}`

			`_readByte(found, pos) {`
			`return found.buffer[pos - found.begin]`
			`}`

			`_readBytes(found, pos, end) {`
			`return found.buffer.subarray(pos - found.begin, end - found.begin)`
			`}`

			`// handle accesses to the bytes`

			`ensureByte(pos) {`
			`this._ensureGetPos(pos) // may throw a MissingDataException`
			`}`

			`getByte() {`
			`const pos = this.pos`
			`if (this.pos >= this.end) {`
			`return -1`
			`}`
			`const found = this._ensureGetPos(pos)`
			`return this._readByte(found, this.pos++)`
			`}`

			`// BG: for a range, end is not included (see Buffer.subarray for example)`

			`ensureBytes(length, forceClamped = false) {`
			`const pos = this.pos`
			`this._ensureGetRange(pos, pos + length)`
			`}`

			`getBytes(length, forceClamped = false) {`
			`const pos = this.pos`
			`const strEnd = this.end`

			`const found = this._ensureGetRange(pos, pos + length)`
			`if (!length) {`
			`const subarray = this._readBytes(found, pos, strEnd)`
			// `this.bytes` is always a `Uint8Array` here.
			`return forceClamped ? new Uint8ClampedArray(subarray) : subarray`
			`}`
			`let end = pos + length`
			`if (end > strEnd) {`
			`end = strEnd`
			`}`
			`this.pos = end`
			`const subarray = this._readBytes(found, pos, end)`
			// `this.bytes` is always a `Uint8Array` here.
			`return forceClamped ? new Uint8ClampedArray(subarray) : subarray`
			`}`

			`getByteRange() {`
			`// BG: this isn't needed as far as I can tell`
			`throw new Error('not implemented')`
			`}`

			`reset() {`
			`this.pos = this.start`
			`}`

			`moveStart() {`
			`this.start = this.pos`
			`}`

			`makeSubStream(start, length, dict = null) {`
[misc] bail out from pdf caching processing after 10s or earlier ...for fast compiles. 2021-06-23 09:14:28 -04:00			this.checkDeadline(`make sub stream start=${start}/length=${length}`)
[ContentCacheManager] use PDF.js Xref table instead of stream detection (#242) * make the content cache manager tests configurable * extend stream content in unit tests * [ContentCacheManagerTests] prepare for full object caching * filesystem stream for pdfjs * working?? * cleaning up * handle overflow * [misc] install pdfjs-dist * [misc] move pdfjs code into app/lib/ and scripts/, also use CamelCase * [misc] abstract the file loading and parsing of xRef tables into helper * [misc] pdfjsTests: add snapshot based tests for the Xref table parser * [misc] FSStream: throw proper error and drop commented code * [misc] FSStream: integrate throwing of MissingDataException into getter * [misc] pdfjs: fix eslint errors * [misc] pdfjs: run format_fix * [misc] pdfjs: allocate very small non empty dummy buffers explicitly * [misc] install @overleaf/o-error * [ContentCacheManager] use PDF.js Xref table instead of stream detection Co-Authored-By: Brian Gough <brian.gough@overleaf.com> * [pdfjs] parseXrefTable: handle empty PDF files gracefully Co-authored-by: Brian Gough <brian.gough@overleaf.com> 2021-05-31 04:20:25 -04:00			`// BG: had to add this check for null length, it is being called with only`
			`// the start value at one point in the xref decoding. The intent is clear`
			`// enough`
			`// - a null length means "to the end of the file" -- not sure how it is`
			`// working in the existing pdfjs code without this.`
			`if (!length) {`
			`length = this.end - start`
			`}`
[misc] bail out from pdf caching processing after 10s or earlier ...for fast compiles. 2021-06-23 09:14:28 -04:00			`return new FSStream(`
			`this.fh,`
			`start,`
			`length,`
			`dict,`
			`this.cachedBytes,`
			`this.checkDeadline`
			`)`
[ContentCacheManager] use PDF.js Xref table instead of stream detection (#242) * make the content cache manager tests configurable * extend stream content in unit tests * [ContentCacheManagerTests] prepare for full object caching * filesystem stream for pdfjs * working?? * cleaning up * handle overflow * [misc] install pdfjs-dist * [misc] move pdfjs code into app/lib/ and scripts/, also use CamelCase * [misc] abstract the file loading and parsing of xRef tables into helper * [misc] pdfjsTests: add snapshot based tests for the Xref table parser * [misc] FSStream: throw proper error and drop commented code * [misc] FSStream: integrate throwing of MissingDataException into getter * [misc] pdfjs: fix eslint errors * [misc] pdfjs: run format_fix * [misc] pdfjs: allocate very small non empty dummy buffers explicitly * [misc] install @overleaf/o-error * [ContentCacheManager] use PDF.js Xref table instead of stream detection Co-Authored-By: Brian Gough <brian.gough@overleaf.com> * [pdfjs] parseXrefTable: handle empty PDF files gracefully Co-authored-by: Brian Gough <brian.gough@overleaf.com> 2021-05-31 04:20:25 -04:00			`}`
			`}`

			`module.exports = { FSStream }`