overleaf/services/clsi/app/lib/pdfjs/FSStream.js
Jakob Ackermann 294088fb27 [ContentCacheManager] use PDF.js Xref table instead of stream detection (#242)
* make the content cache manager tests configurable

* extend stream content in unit tests

* [ContentCacheManagerTests] prepare for full object caching

* filesystem stream for pdfjs

* working??

* cleaning up

* handle overflow

* [misc] install pdfjs-dist

* [misc] move pdfjs code into app/lib/ and scripts/, also use CamelCase

* [misc] abstract the file loading and parsing of xRef tables into helper

* [misc] pdfjsTests: add snapshot based tests for the Xref table parser

* [misc] FSStream: throw proper error and drop commented code

* [misc] FSStream: integrate throwing of MissingDataException into getter

* [misc] pdfjs: fix eslint errors

* [misc] pdfjs: run format_fix

* [misc] pdfjs: allocate very small non empty dummy buffers explicitly

* [misc] install @overleaf/o-error

* [ContentCacheManager] use PDF.js Xref table instead of stream detection

Co-Authored-By: Brian Gough <brian.gough@overleaf.com>

* [pdfjs] parseXrefTable: handle empty PDF files gracefully

Co-authored-by: Brian Gough <brian.gough@overleaf.com>
2021-05-31 09:20:25 +01:00

138 lines
3.6 KiB
JavaScript

const { Stream } = require('pdfjs-dist/lib/core/stream')
const { MissingDataException } = require('pdfjs-dist/lib/core/core_utils')
// Minimum size, in bytes, of a single file read: FSStream.requestRange widens
// any shorter request to this many bytes before reading.
const BUF_SIZE = 1024 // read from the file in 1024 byte pages
/**
 * A pdf.js `Stream` whose bytes come from a file handle instead of an
 * in-memory buffer.
 *
 * Bytes are looked up in a list of cached reads (`cachedBytes`). When an
 * accessor needs bytes that no cached read covers, it throws a
 * `MissingDataException`; the range can then be loaded with `requestRange`
 * and the access retried.
 *
 * `start`, `end`, `pos` and the `begin`/`end` of every cached read are all
 * absolute offsets into the file (they are used as the read position for
 * `fh.read`), so they must only ever be compared with each other and never
 * with `length`, which is a size.
 */
class FSStream extends Stream {
  /**
   * @param {object} fh - handle exposing
   *   `read(buffer, offset, length, position)`
   * @param {number} start - offset of the first byte of this stream
   * @param {number} length - number of bytes in this stream
   * @param {*} dict - forwarded unchanged to the `Stream` constructor
   * @param {Array<{begin: number, end: number, buffer: Buffer}>} [cachedBytes]
   *   cache of previous reads to share; a fresh cache is created when omitted
   */
  constructor(fh, start, length, dict, cachedBytes) {
    // Placeholder handed to the base constructor; the real bytes are served
    // from the file, so the placeholder is removed again right away.
    const nonEmptyDummyBuffer = Buffer.alloc(1, 0)
    super(nonEmptyDummyBuffer, start, length, dict)
    delete this.bytes
    this.fh = fh
    this.cachedBytes = cachedBytes || []
  }

  /** Number of bytes between `start` and `end`. */
  get length() {
    return this.end - this.start
  }

  /** True when the stream spans zero bytes. */
  get isEmpty() {
    return this.length === 0
  }

  // Manage cached reads from the file

  /**
   * Read `[begin, end)` from the file and register it in the cache.
   *
   * The cache entry is registered before the read is issued.
   * NOTE(review): `fh.read` is presumably asynchronous, in which case callers
   * must wait for the returned value before retrying an access -- confirm.
   *
   * @param {number} begin - first offset to read (inclusive)
   * @param {number} end - offset to stop at (exclusive)
   * @returns {*} whatever `fh.read` returns
   */
  requestRange(begin, end) {
    // expand small ranges to read a larger amount
    if (end - begin < BUF_SIZE) {
      end = begin + BUF_SIZE
    }
    // `end` is an absolute offset, so it is clamped to the absolute end of
    // the stream (not to `length`, which differs whenever `start` > 0).
    end = Math.min(end, this.end)
    // keep a cache of previous reads with {begin,end,buffer} values
    const result = {
      begin: begin,
      end: end,
      buffer: Buffer.alloc(end - begin, 0),
    }
    this.cachedBytes.push(result)
    return this.fh.read(result.buffer, 0, end - begin, begin)
  }

  /**
   * Find the cached read that contains offset `pos`.
   *
   * @param {number} pos
   * @returns {{begin: number, end: number, buffer: Buffer}}
   * @throws {MissingDataException} when no cached read contains `pos`
   */
  _ensureGetPos(pos) {
    const found = this.cachedBytes.find((x) => {
      return x.begin <= pos && pos < x.end
    })
    if (!found) {
      throw new MissingDataException(pos, pos + 1)
    }
    return found
  }

  /**
   * Find a single cached read that covers all of `[begin, end)`.
   *
   * @param {number} begin - first offset needed (inclusive)
   * @param {number} end - offset to stop at (exclusive); clamped to the end
   *   of the stream
   * @returns {{begin: number, end: number, buffer: Buffer}}
   * @throws {MissingDataException} when no single cached read covers the range
   */
  _ensureGetRange(begin, end) {
    // BG: handle overflow case. The clamp uses the absolute `this.end` so
    // that it stays correct for streams that do not start at offset 0.
    end = Math.min(end, this.end)
    const found = this.cachedBytes.find((x) => {
      return x.begin <= begin && end <= x.end
    })
    if (!found) {
      throw new MissingDataException(begin, end)
    }
    return found
  }

  /**
   * Offset at which a read of `length` bytes from the current position stops.
   * A missing or zero `length` means "up to the end of the stream".
   *
   * @param {number} [length]
   * @returns {number}
   */
  _getReadEnd(length) {
    if (!length) {
      return this.end
    }
    return Math.min(this.pos + length, this.end)
  }

  /** Read the byte at absolute offset `pos` out of the cached read `found`. */
  _readByte(found, pos) {
    return found.buffer[pos - found.begin]
  }

  /** View of `[pos, end)` (absolute offsets) inside the cached read `found`. */
  _readBytes(found, pos, end) {
    return found.buffer.subarray(pos - found.begin, end - found.begin)
  }

  // handle accesses to the bytes

  /**
   * Check that the byte at `pos` is cached.
   *
   * @param {number} pos
   * @throws {MissingDataException}
   */
  ensureByte(pos) {
    this._ensureGetPos(pos) // may throw a MissingDataException
  }

  /**
   * Read the byte at the current position and advance by one.
   *
   * @returns {number} the byte value, or -1 at the end of the stream
   * @throws {MissingDataException} when the byte is not cached
   */
  getByte() {
    const pos = this.pos
    if (pos >= this.end) {
      return -1
    }
    const found = this._ensureGetPos(pos)
    this.pos = pos + 1
    return this._readByte(found, pos)
  }

  // BG: for a range, end is not included (see Buffer.subarray for example)

  /**
   * Check that the bytes a `getBytes(length)` call would return are cached.
   *
   * @param {number} [length] - missing or zero means "to the end of the stream"
   * @param {boolean} [forceClamped] - unused here
   * @throws {MissingDataException}
   */
  ensureBytes(length, forceClamped = false) {
    this._ensureGetRange(this.pos, this._getReadEnd(length))
  }

  /**
   * Read bytes starting at the current position.
   *
   * With a non-zero `length`, at most `length` bytes are returned and the
   * position advances past them. With a missing or zero `length`, everything
   * up to the end of the stream is returned and the position is left alone.
   *
   * @param {number} [length]
   * @param {boolean} [forceClamped] - return a `Uint8ClampedArray` copy
   *   instead of a view on the cached buffer
   * @returns {Uint8Array|Uint8ClampedArray}
   * @throws {MissingDataException} when the range is not covered by a single
   *   cached read
   */
  getBytes(length, forceClamped = false) {
    const pos = this.pos
    // Resolve the real end of the read first, so that the cache lookup
    // checks exactly the range that is returned below.
    const end = this._getReadEnd(length)
    const found = this._ensureGetRange(pos, end)
    if (length) {
      this.pos = end
    }
    // The result is a view on the cached read's buffer unless a clamped
    // copy was asked for.
    const subarray = this._readBytes(found, pos, end)
    return forceClamped ? new Uint8ClampedArray(subarray) : subarray
  }

  /** Not supported by this stream; always throws. */
  getByteRange() {
    // BG: this isn't needed as far as I can tell
    throw new Error('not implemented')
  }

  /** Move the current position back to the start of the stream. */
  reset() {
    this.pos = this.start
  }

  /** Make the current position the new start of the stream. */
  moveStart() {
    this.start = this.pos
  }

  /**
   * Create a stream over part of the same file. The new stream shares this
   * stream's file handle and cache of reads.
   *
   * @param {number} start - offset of the first byte of the sub-stream
   * @param {number} [length] - size of the sub-stream; when missing it runs
   *   to the end of this stream
   * @param {*} [dict]
   * @returns {FSStream}
   */
  makeSubStream(start, length, dict = null) {
    // BG: had to add this check for null length, it is being called with only
    // the start value at one point in the xref decoding. The intent is clear
    // enough
    // - a null length means "to the end of the file" -- not sure how it is
    // working in the existing pdfjs code without this.
    if (!length) {
      length = this.end - start
    }
    return new FSStream(this.fh, start, length, dict, this.cachedBytes)
  }
}
// The FSStream class is the only export of this module.
module.exports = { FSStream }