[ContentCacheManager] finish tracking of ranges across builds
parent 011a228727
commit 7aeeb5a5a9
4 changed files with 272 additions and 34 deletions
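
For orientation before the diff hunks below, here is a minimal, self-contained sketch (not the Overleaf code) of the generation-based tracking that this commit finishes. The class name HashFileTrackerSketch and the sample hashes and sizes are invented for illustration; the hashAge and hashSize fields and the aging rules mirror what the hunks introduce.

// Minimal sketch, assuming only the behaviour visible in this diff:
// each build bumps a per-hash generation counter, hashes referenced by the
// current build are reset to age 0, and hashes older than N builds are
// reported as stale so their cached range files can be deleted.
class HashFileTrackerSketch {
  constructor({ hashAge = [], hashSize = [] } = {}) {
    this.hashAge = new Map(hashAge) // hash -> builds since last referenced
    this.hashSize = new Map(hashSize) // hash -> bytes of the cached range
  }

  update(ranges, newRanges) {
    for (const [hash, age] of this.hashAge) {
      this.hashAge.set(hash, age + 1)
    }
    for (const range of ranges) {
      this.hashAge.set(range.hash, 0)
    }
    for (const range of newRanges) {
      this.hashSize.set(range.hash, range.end - range.start)
    }
    return this
  }

  findStale(maxAge) {
    return [...this.hashAge.entries()]
      .filter(([, age]) => age > maxAge)
      .map(([hash]) => hash)
  }
}

// Simulate seven rebuilds in which range 'b' never reappears: after more
// than five builds it becomes stale and its size could be reclaimed.
const tracker = new HashFileTrackerSketch({
  hashAge: [['a', 0], ['b', 0]],
  hashSize: [['a', 18], ['b', 19]],
})
for (let build = 0; build < 7; build++) {
  tracker.update([{ start: 3, end: 21, hash: 'a' }], [])
}
console.log(tracker.findStale(5)) // -> ['b']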
@@ -7,6 +7,7 @@ const fs = require('fs')
 const crypto = require('crypto')
 const Path = require('path')
 const Settings = require('settings-sharelatex')
+const pLimit = require('p-limit')
 
 const MIN_CHUNK_SIZE = Settings.pdfCachingMinChunkSize
 
@@ -27,9 +28,7 @@ async function update(contentDir, filePath) {
   const newRanges = []
   const seenHashes = new Set()
   // keep track of hashes expire old ones when they reach a generation > N.
-  const tracker = new HashFileTracker()
-  await loadState(contentDir, tracker)
-
+  const tracker = await HashFileTracker.from(contentDir)
   for await (const chunk of stream) {
     const pdfStreams = extractor.consume(chunk)
     for (const pdfStream of pdfStreams) {
@@ -48,43 +47,98 @@ async function update(contentDir, filePath) {
       }
     }
   }
-  const expiredHashes = tracker.update(ranges).findStale(5)
-  await deleteHashFiles(expiredHashes)
-  return [ranges, newRanges]
+  tracker.update(ranges, newRanges)
+  const reclaimedSpace = await tracker.deleteStaleHashes(5)
+  await tracker.flush()
+  return [ranges, newRanges, reclaimedSpace]
+}
+
+function getStatePath(contentDir) {
+  return Path.join(contentDir, '.state.v0.json')
 }
 
 class HashFileTracker {
-  constructor(contentDir) {
-    this.hashAge = new Map()
+  constructor(contentDir, { hashAge = [], hashSize = [] }) {
+    this.contentDir = contentDir
+    this.hashAge = new Map(hashAge)
+    this.hashSize = new Map(hashSize)
   }
 
-  update(ranges) {
+  static async from(contentDir) {
+    const statePath = getStatePath(contentDir)
+    let state = {}
+    try {
+      const blob = await fs.promises.readFile(statePath)
+      state = JSON.parse(blob)
+    } catch (e) {}
+    return new HashFileTracker(contentDir, state)
+  }
+
+  update(ranges, newRanges) {
     for (const [hash, age] of this.hashAge) {
       this.hashAge.set(hash, age + 1)
     }
-    for (const range in ranges) {
+    for (const range of ranges) {
       this.hashAge.set(range.hash, 0)
     }
+    for (const range of newRanges) {
+      this.hashSize.set(range.hash, range.end - range.start)
+    }
+    return this
   }
 
   findStale(maxAge) {
-    var stale = []
+    const stale = []
     for (const [hash, age] of this.hashAge) {
       if (age > maxAge) {
        stale.push(hash)
-        this.hashAge.delete(hash)
       }
     }
     return stale
   }
 
+  async flush() {
+    const statePath = getStatePath(this.contentDir)
+    const blob = JSON.stringify({
+      hashAge: Array.from(this.hashAge.entries()),
+      hashSize: Array.from(this.hashSize.entries())
+    })
+    const atomicWrite = statePath + '~'
+    try {
+      await fs.promises.writeFile(atomicWrite, blob)
+    } catch (err) {
+      try {
+        await fs.promises.unlink(atomicWrite)
+      } catch (e) {}
+      throw err
+    }
+    try {
+      await fs.promises.rename(atomicWrite, statePath)
+    } catch (err) {
+      try {
+        await fs.promises.unlink(atomicWrite)
+      } catch (e) {}
+      throw err
+    }
   }
 
-async function loadState(contentDir, tracker) {
-}
-
-async function deleteHashFiles(n) {
+  async deleteStaleHashes(n) {
     // delete any hash file older than N generations
+    const hashes = this.findStale(n)
+
+    let reclaimedSpace = 0
+    if (hashes.length === 0) {
+      return reclaimedSpace
+    }
+
+    await promiseMapWithLimit(10, hashes, async (hash) => {
+      await fs.promises.unlink(Path.join(this.contentDir, hash))
+      this.hashAge.delete(hash)
+      reclaimedSpace += this.hashSize.get(hash)
+      this.hashSize.delete(hash)
+    })
+    return reclaimedSpace
+  }
 }
 
 class PdfStreamsExtractor {
@@ -193,6 +247,11 @@ async function writePdfStream(dir, hash, buffers) {
   return true
 }
 
+function promiseMapWithLimit(concurrency, array, fn) {
+  const limit = pLimit(concurrency)
+  return Promise.all(array.map((x) => limit(() => fn(x))))
+}
+
 module.exports = {
   HASH_REGEX: /^[0-9a-f]{64}$/,
   update: callbackify(update)
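
The flush() method added above persists the tracker between builds as .state.v0.json inside the content directory, writing to a '~' temp file first and then renaming it into place. Judging from that hunk and from the unit-test expectations later in this diff, the persisted payload looks roughly like the following; the hash strings and byte counts are placeholders.

// Hedged illustration of the persisted state shape; only the structure
// follows the diff, the values are made up.
const state = {
  // [hash, generations since the range was last referenced]
  hashAge: [
    ['3d4c...a1', 0],
    ['9f2e...7b', 1]
  ],
  // [hash, size in bytes of the cached range file]
  hashSize: [
    ['3d4c...a1', 18],
    ['9f2e...7b', 19]
  ]
}
console.log(JSON.stringify(state))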
@@ -72,7 +72,10 @@ function emitPdfCachingStats(stats, timings) {
 
   // How much space do the ranges use?
   // This will accumulate the ranges size over time, skipping already written ranges.
-  Metrics.summary('pdf-ranges-disk-size', stats['pdf-caching-new-ranges-size'])
+  Metrics.summary(
+    'pdf-ranges-disk-size',
+    stats['pdf-caching-new-ranges-size'] - stats['pdf-caching-reclaimed-space']
+  )
 }
 
 module.exports = {
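
With the reclaimed space now reported back to the caller, the pdf-ranges-disk-size summary above tracks the net growth of the content directory per compile rather than just the bytes newly written. A rough worked example with invented numbers:

// If a compile wrote 2048 bytes of new ranges while the tracker deleted
// 512 bytes of stale ones, the summary records the net change of 1536.
const stats = {
  'pdf-caching-new-ranges-size': 2048,
  'pdf-caching-reclaimed-space': 512
}
const netDiskGrowth =
  stats['pdf-caching-new-ranges-size'] - stats['pdf-caching-reclaimed-space']
console.log(netDiskGrowth) // 1536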
@@ -278,10 +278,10 @@ module.exports = OutputCacheManager = {
       const timer = new Metrics.Timer('compute-pdf-ranges')
       ContentCacheManager.update(contentDir, outputFilePath, function (
         err,
-        ranges
+        result
       ) {
         if (err) return callback(err, outputFiles)
-        const [contentRanges, newContentRanges] = ranges
+        const [contentRanges, newContentRanges, reclaimedSpace] = result
 
         if (Settings.enablePdfCachingDark) {
           // In dark mode we are doing the computation only and do not emit
@@ -302,6 +302,7 @@ module.exports = OutputCacheManager = {
             (sum, next) => sum + (next.end - next.start),
             0
           )
+          stats['pdf-caching-reclaimed-space'] = reclaimedSpace
           callback(null, outputFiles)
         })
       } else {
@@ -48,16 +48,19 @@ describe('ContentCacheManager', function () {
       }
     })
   }
-  let contentRanges, newContentRanges
+  let contentRanges, newContentRanges, reclaimed
   function run(filePath, done) {
     ContentCacheManager.update(contentDir, filePath, (err, ranges) => {
       if (err) return done(err)
-      ;[contentRanges, newContentRanges] = ranges
+      let newlyReclaimed
+      ;[contentRanges, newContentRanges, newlyReclaimed] = ranges
+      reclaimed += newlyReclaimed
       done()
     })
   }
 
   beforeEach(function () {
+    reclaimed = 0
     contentDir =
       '/app/output/602cee6f6460fca0ba7921e6/content/1797a7f48f9-5abc1998509dea1f'
     pdfPath =
@@ -70,6 +73,18 @@ describe('ContentCacheManager', function () {
     fs = {
       createReadStream: sinon.stub().returns(Readable.from([])),
       promises: {
+        async writeFile(name, blob) {
+          const file = new FakeFile()
+          await file.write(Buffer.from(blob))
+          await file.close()
+          files[name] = file
+        },
+        async readFile(name) {
+          if (!files[name]) {
+            throw new Error()
+          }
+          return files[name].toJSON().contents
+        },
         async open(name) {
           files[name] = new FakeFile()
           return files[name]
@@ -86,7 +101,12 @@ describe('ContentCacheManager', function () {
           files[newName] = files[oldName]
           delete files[oldName]
         },
-        unlink: sinon.stub().resolves()
+        async unlink(name) {
+          if (!files[name]) {
+            throw new Error()
+          }
+          delete files[name]
+        }
       }
     }
   })
@@ -99,9 +119,12 @@ describe('ContentCacheManager', function () {
 
   describe('when the ranges are split across chunks', function () {
     const RANGE_1 = 'stream123endstream'
-    const RANGE_2 = 'stream(|)endstream'
-    const RANGE_3 = 'stream!$%endstream'
-    beforeEach(function (done) {
+    const RANGE_2 = 'stream(||)endstream'
+    const RANGE_3 = 'stream!$%/=endstream'
+    const h1 = hash(RANGE_1)
+    const h2 = hash(RANGE_2)
+    const h3 = hash(RANGE_3)
+    function runWithSplitStream(done) {
       fs.createReadStream
         .withArgs(pdfPath)
         .returns(
@@ -109,12 +132,15 @@ describe('ContentCacheManager', function () {
             Buffer.from('abcstr'),
             Buffer.from('eam123endstreamABC'),
             Buffer.from('str'),
-            Buffer.from('eam(|'),
+            Buffer.from('eam(||'),
             Buffer.from(')end'),
-            Buffer.from('stream-_~stream!$%endstream')
+            Buffer.from('stream-_~stream!$%/=endstream')
           ])
         )
       run(pdfPath, done)
+    }
+    beforeEach(function (done) {
+      runWithSplitStream(done)
     })
 
     it('should produce three ranges', function () {
@@ -130,12 +156,12 @@ describe('ContentCacheManager', function () {
         },
         {
           start: 24,
-          end: 42,
+          end: 43,
           hash: hash(RANGE_2)
         },
         {
-          start: 45,
-          end: 63,
+          start: 46,
+          end: 66,
           hash: hash(RANGE_3)
         }
       ])
@@ -143,17 +169,32 @@ describe('ContentCacheManager', function () {
 
     it('should store the contents', function () {
       expect(JSON.parse(JSON.stringify(files))).to.deep.equal({
-        [Path.join(contentDir, hash(RANGE_1))]: {
+        [Path.join(contentDir, h1)]: {
           contents: RANGE_1,
           closed: true
         },
-        [Path.join(contentDir, hash(RANGE_2))]: {
+        [Path.join(contentDir, h2)]: {
           contents: RANGE_2,
           closed: true
         },
-        [Path.join(contentDir, hash(RANGE_3))]: {
+        [Path.join(contentDir, h3)]: {
           contents: RANGE_3,
           closed: true
+        },
+        [Path.join(contentDir, '.state.v0.json')]: {
+          contents: JSON.stringify({
+            hashAge: [
+              [h1, 0],
+              [h2, 0],
+              [h3, 0]
+            ],
+            hashSize: [
+              [h1, 18],
+              [h2, 19],
+              [h3, 20]
+            ]
+          }),
+          closed: true
         }
       })
     })
@@ -161,6 +202,140 @@ describe('ContentCacheManager', function () {
     it('should mark all ranges as new', function () {
       expect(contentRanges).to.deep.equal(newContentRanges)
     })
+
+    describe('when re-running with one stream removed', function () {
+      function runWithOneSplitStreamRemoved(done) {
+        fs.createReadStream
+          .withArgs(pdfPath)
+          .returns(
+            Readable.from([
+              Buffer.from('abcstr'),
+              Buffer.from('eam123endstreamABC'),
+              Buffer.from('stream!$%/=endstream')
+            ])
+          )
+        run(pdfPath, done)
+      }
+      beforeEach(function (done) {
+        runWithOneSplitStreamRemoved(done)
+      })
+
+      it('should produce two ranges', function () {
+        expect(contentRanges).to.have.length(2)
+      })
+
+      it('should find the correct offsets', function () {
+        expect(contentRanges).to.deep.equal([
+          {
+            start: 3,
+            end: 21,
+            hash: hash(RANGE_1)
+          },
+          {
+            start: 24,
+            end: 44,
+            hash: hash(RANGE_3)
+          }
+        ])
+      })
+
+      it('should update the age of the 2nd range', function () {
+        expect(JSON.parse(JSON.stringify(files))).to.deep.equal({
+          [Path.join(contentDir, h1)]: {
+            contents: RANGE_1,
+            closed: true
+          },
+          [Path.join(contentDir, h2)]: {
+            contents: RANGE_2,
+            closed: true
+          },
+          [Path.join(contentDir, h3)]: {
+            contents: RANGE_3,
+            closed: true
+          },
+          [Path.join(contentDir, '.state.v0.json')]: {
+            contents: JSON.stringify({
+              hashAge: [
+                [h1, 0],
+                [h2, 1],
+                [h3, 0]
+              ],
+              hashSize: [
+                [h1, 18],
+                [h2, 19],
+                [h3, 20]
+              ]
+            }),
+            closed: true
+          }
+        })
+      })
+
+      it('should find no new ranges', function () {
+        expect(newContentRanges).to.deep.equal([])
+      })
+
+      describe('when re-running 5 more times', function () {
+        for (let i = 0; i < 5; i++) {
+          beforeEach(function (done) {
+            runWithOneSplitStreamRemoved(done)
+          })
+        }
+
+        it('should still produce two ranges', function () {
+          expect(contentRanges).to.have.length(2)
+        })
+
+        it('should still find the correct offsets', function () {
+          expect(contentRanges).to.deep.equal([
+            {
+              start: 3,
+              end: 21,
+              hash: hash(RANGE_1)
+            },
+            {
+              start: 24,
+              end: 44,
+              hash: hash(RANGE_3)
+            }
+          ])
+        })
+
+        it('should delete the 2nd range', function () {
+          expect(JSON.parse(JSON.stringify(files))).to.deep.equal({
+            [Path.join(contentDir, h1)]: {
+              contents: RANGE_1,
+              closed: true
+            },
+            [Path.join(contentDir, h3)]: {
+              contents: RANGE_3,
+              closed: true
+            },
+            [Path.join(contentDir, '.state.v0.json')]: {
+              contents: JSON.stringify({
+                hashAge: [
+                  [h1, 0],
+                  [h3, 0]
+                ],
+                hashSize: [
+                  [h1, 18],
+                  [h3, 20]
+                ]
+              }),
+              closed: true
+            }
+          })
+        })
+
+        it('should find no new ranges', function () {
+          expect(newContentRanges).to.deep.equal([])
+        })
+
+        it('should yield the reclaimed space', function () {
+          expect(reclaimed).to.equal(RANGE_2.length)
+        })
+      })
+    })
     })
   })
 })