Merge pull request #4649 from overleaf/jpa-fs-based-caching

[perf] UrlCache: pure fs based cache state for downloads

GitOrigin-RevId: d19afc396324d4c3318b31620c8ad0c04e0544ce
This commit is contained in:
Jakob Ackermann 2021-10-06 10:11:59 +02:00 committed by Copybot
parent 98db86b8f0
commit 02918e7483
8 changed files with 154 additions and 567 deletions

View file

@ -76,12 +76,16 @@ module.exports = ResourceWriter = {
)
}
)
} else {
logger.log(
{ project_id: request.project_id, user_id: request.user_id },
'full sync'
)
return this.saveAllResourcesToDisk(
}
logger.log(
{ project_id: request.project_id, user_id: request.user_id },
'full sync'
)
UrlCache.createProjectDir(request.project_id, error => {
if (error != null) {
return callback(error)
}
this.saveAllResourcesToDisk(
request.project_id,
request.resources,
basePath,
@ -102,7 +106,7 @@ module.exports = ResourceWriter = {
)
}
)
}
})
},
saveIncrementalResourcesToDisk(project_id, resources, basePath, callback) {

View file

@ -12,270 +12,71 @@
* DS207: Consider shorter variations of null checks
* Full docs: https://github.com/decaffeinate/decaffeinate/blob/master/docs/suggestions.md
*/
let UrlCache
const db = require('./db')
const dbQueue = require('./DbQueue')
const UrlFetcher = require('./UrlFetcher')
const Settings = require('@overleaf/settings')
const crypto = require('crypto')
const fs = require('fs')
const logger = require('logger-sharelatex')
const async = require('async')
const Metrics = require('./Metrics')
const fse = require('fs-extra')
const Path = require('path')
const { callbackify } = require('util')
module.exports = UrlCache = {
downloadUrlToFile(project_id, url, destPath, lastModified, callback) {
if (callback == null) {
callback = function (error) {}
}
return UrlCache._ensureUrlIsInCache(
project_id,
url,
lastModified,
(error, pathToCachedUrl) => {
if (error != null) {
return callback(error)
}
return fs.copyFile(pathToCachedUrl, destPath, function (error) {
if (error != null) {
logger.error(
{ err: error, from: pathToCachedUrl, to: destPath },
'error copying file from cache'
)
return UrlCache._clearUrlDetails(project_id, url, () =>
callback(error)
)
} else {
return callback(error)
}
})
}
)
},
const PENDING_DOWNLOADS = new Map()
clearProject(project_id, callback) {
if (callback == null) {
callback = function (error) {}
}
return UrlCache._findAllUrlsInProject(project_id, function (error, urls) {
logger.log(
{ project_id, url_count: urls.length },
'clearing project URLs'
)
if (error != null) {
return callback(error)
}
const jobs = Array.from(urls || []).map(url =>
(
url => callback =>
UrlCache._clearUrlFromCache(project_id, url, function (error) {
if (error != null) {
logger.error(
{ err: error, project_id, url },
'error clearing project URL'
)
}
return callback()
})
)(url)
)
return async.series(jobs, callback)
})
},
function getProjectDir(projectId) {
return Path.join(Settings.path.clsiCacheDir, projectId)
}
_ensureUrlIsInCache(project_id, url, lastModified, callback) {
if (callback == null) {
callback = function (error, pathOnDisk) {}
}
if (lastModified != null) {
// MYSQL only stores dates to an accuracy of a second but the incoming lastModified might have milliseconds.
// So round down to seconds
lastModified = new Date(Math.floor(lastModified.getTime() / 1000) * 1000)
}
return UrlCache._doesUrlNeedDownloading(
project_id,
url,
lastModified,
(error, needsDownloading) => {
if (error != null) {
return callback(error)
}
if (needsDownloading) {
logger.log({ url, lastModified }, 'downloading URL')
return UrlFetcher.pipeUrlToFileWithRetry(
url,
UrlCache._cacheFilePathForUrl(project_id, url),
error => {
if (error != null) {
return callback(error)
}
return UrlCache._updateOrCreateUrlDetails(
project_id,
url,
lastModified,
error => {
if (error != null) {
return callback(error)
}
return callback(
null,
UrlCache._cacheFilePathForUrl(project_id, url)
)
}
)
}
)
} else {
logger.log({ url, lastModified }, 'URL is up to date in cache')
return callback(null, UrlCache._cacheFilePathForUrl(project_id, url))
}
}
)
},
function getCachePath(projectId, url, lastModified) {
// The url is a filestore URL.
// It is sufficient to look at the path and mtime for uniqueness.
const mtime = (lastModified && lastModified.getTime()) || 0
const key = new URL(url).pathname.replace(/\//g, '-') + '-' + mtime
return Path.join(getProjectDir(projectId), key)
}
_doesUrlNeedDownloading(project_id, url, lastModified, callback) {
if (callback == null) {
callback = function (error, needsDownloading) {}
}
if (lastModified == null) {
return callback(null, true)
}
return UrlCache._findUrlDetails(
project_id,
url,
function (error, urlDetails) {
if (error != null) {
return callback(error)
}
if (
urlDetails == null ||
urlDetails.lastModified == null ||
urlDetails.lastModified.getTime() < lastModified.getTime()
) {
return callback(null, true)
} else {
return callback(null, false)
}
}
)
},
async function clearProject(projectId) {
await fse.remove(getProjectDir(projectId))
}
_cacheFileNameForUrl(project_id, url) {
return project_id + ':' + crypto.createHash('md5').update(url).digest('hex')
},
async function createProjectDir(projectId) {
await fs.promises.mkdir(getProjectDir(projectId), { recursive: true })
}
_cacheFilePathForUrl(project_id, url) {
return `${Settings.path.clsiCacheDir}/${UrlCache._cacheFileNameForUrl(
project_id,
url
)}`
},
_clearUrlFromCache(project_id, url, callback) {
if (callback == null) {
callback = function (error) {}
async function downloadUrlToFile(projectId, url, destPath, lastModified) {
const cachePath = getCachePath(projectId, url, lastModified)
try {
await fs.promises.copyFile(cachePath, destPath)
return
} catch (e) {
if (e.code !== 'ENOENT') {
throw e
}
return UrlCache._clearUrlDetails(project_id, url, function (error) {
if (error != null) {
return callback(error)
}
return UrlCache._deleteUrlCacheFromDisk(
project_id,
url,
function (error) {
if (error != null) {
return callback(error)
}
return callback(null)
}
)
})
},
}
await download(url, cachePath)
await fs.promises.copyFile(cachePath, destPath)
}
_deleteUrlCacheFromDisk(project_id, url, callback) {
if (callback == null) {
callback = function (error) {}
}
return fs.unlink(
UrlCache._cacheFilePathForUrl(project_id, url),
function (error) {
if (error != null && error.code !== 'ENOENT') {
// no error if the file isn't present
return callback(error)
} else {
return callback()
}
}
)
},
async function download(url, cachePath) {
let pending = PENDING_DOWNLOADS.get(cachePath)
if (pending) {
return pending
}
_findUrlDetails(project_id, url, callback) {
if (callback == null) {
callback = function (error, urlDetails) {}
}
const timer = new Metrics.Timer('db-find-url-details')
const job = cb =>
db.UrlCache.findOne({ where: { url, project_id } })
.then(urlDetails => cb(null, urlDetails))
.error(cb)
dbQueue.queue.push(job, (error, urlDetails) => {
timer.done()
callback(error, urlDetails)
})
},
pending = UrlFetcher.promises.pipeUrlToFileWithRetry(url, cachePath)
PENDING_DOWNLOADS.set(cachePath, pending)
try {
await pending
} finally {
PENDING_DOWNLOADS.delete(cachePath)
}
}
_updateOrCreateUrlDetails(project_id, url, lastModified, callback) {
if (callback == null) {
callback = function (error) {}
}
const timer = new Metrics.Timer('db-update-or-create-url-details')
const job = cb =>
db.UrlCache.findOrCreate({ where: { url, project_id } })
.spread((urlDetails, created) =>
urlDetails
.update({ lastModified })
.then(() => cb())
.error(cb)
)
.error(cb)
dbQueue.queue.push(job, error => {
timer.done()
callback(error)
})
},
_clearUrlDetails(project_id, url, callback) {
if (callback == null) {
callback = function (error) {}
}
const timer = new Metrics.Timer('db-clear-url-details')
const job = cb =>
db.UrlCache.destroy({ where: { url, project_id } })
.then(() => cb(null))
.error(cb)
dbQueue.queue.push(job, error => {
timer.done()
callback(error)
})
},
_findAllUrlsInProject(project_id, callback) {
if (callback == null) {
callback = function (error, urls) {}
}
const timer = new Metrics.Timer('db-find-urls-in-project')
const job = cb =>
db.UrlCache.findAll({ where: { project_id } })
.then(urlEntries =>
cb(
null,
urlEntries.map(entry => entry.url)
)
)
.error(cb)
dbQueue.queue.push(job, (err, urls) => {
timer.done()
callback(err, urls)
})
module.exports = {
clearProject: callbackify(clearProject),
createProjectDir: callbackify(createProjectDir),
downloadUrlToFile: callbackify(downloadUrlToFile),
promises: {
clearProject,
createProjectDir,
downloadUrlToFile,
},
}

View file

@ -19,6 +19,7 @@ const logger = require('logger-sharelatex')
const settings = require('@overleaf/settings')
const URL = require('url')
const async = require('async')
const { promisify } = require('util')
const oneMinute = 60 * 1000
@ -79,7 +80,8 @@ module.exports = UrlFetcher = {
return urlStream.on('response', function (res) {
if (res.statusCode >= 200 && res.statusCode < 300) {
const fileStream = fs.createWriteStream(filePath)
const atomicWrite = filePath + '~'
const fileStream = fs.createWriteStream(atomicWrite)
// attach handlers before setting up pipes
fileStream.on('error', function (error) {
@ -87,7 +89,7 @@ module.exports = UrlFetcher = {
{ err: error, url, filePath },
'error writing file into cache'
)
return fs.unlink(filePath, function (err) {
return fs.unlink(atomicWrite, function (err) {
if (err != null) {
logger.err({ err, filePath }, 'error deleting file from cache')
}
@ -97,7 +99,13 @@ module.exports = UrlFetcher = {
fileStream.on('finish', function () {
logger.log({ url, filePath }, 'finished writing file into cache')
return callbackOnce()
fs.rename(atomicWrite, filePath, error => {
if (error) {
fs.unlink(atomicWrite, () => callbackOnce(error))
} else {
callbackOnce()
}
})
})
fileStream.on('pipe', () =>
@ -129,3 +137,7 @@ module.exports = UrlFetcher = {
})
},
}
module.exports.promises = {
pipeUrlToFileWithRetry: promisify(UrlFetcher.pipeUrlToFileWithRetry),
}

View file

@ -250,8 +250,8 @@ describe('Url Caching', function () {
return Server.getFile.restore()
})
return it('should not download the image again', function () {
return Server.getFile.called.should.equal(false)
return it('should download the other revision', function () {
return Server.getFile.called.should.equal(true)
})
})

View file

@ -18,5 +18,5 @@ SandboxedModule.configure({
err() {},
},
},
globals: { Buffer, console, process },
globals: { Buffer, console, process, URL },
})

View file

@ -30,7 +30,9 @@ describe('ResourceWriter', function () {
unlink: sinon.stub().callsArg(1),
}),
'./ResourceStateManager': (this.ResourceStateManager = {}),
'./UrlCache': (this.UrlCache = {}),
'./UrlCache': (this.UrlCache = {
createProjectDir: sinon.stub().yields(),
}),
'./OutputFileFinder': (this.OutputFileFinder = {}),
'./Metrics': (this.Metrics = {
inc: sinon.stub(),

View file

@ -12,342 +12,101 @@
*/
const SandboxedModule = require('sandboxed-module')
const sinon = require('sinon')
const { expect } = require('chai')
const modulePath = require('path').join(__dirname, '../../../app/js/UrlCache')
const { EventEmitter } = require('events')
describe('UrlCache', function () {
beforeEach(function () {
this.callback = sinon.stub()
this.url = 'www.example.com/file'
this.project_id = 'project-id-123'
this.url =
'http://filestore/project/60b0dd39c418bc00598a0d22/file/60ae721ffb1d920027d3201f'
this.project_id = '60b0dd39c418bc00598a0d22'
return (this.UrlCache = SandboxedModule.require(modulePath, {
requires: {
'./db': {},
'./UrlFetcher': (this.UrlFetcher = {}),
'./UrlFetcher': (this.UrlFetcher = {
promises: { pipeUrlToFileWithRetry: sinon.stub().resolves() },
}),
'@overleaf/settings': (this.Settings = {
path: { clsiCacheDir: '/cache/dir' },
}),
fs: (this.fs = { copyFile: sinon.stub().yields() }),
'fs-extra': (this.fse = { remove: sinon.stub().resolves() }),
fs: (this.fs = {
promises: {
copyFile: sinon.stub().resolves(),
},
}),
},
}))
})
describe('_doesUrlNeedDownloading', function () {
beforeEach(function () {
this.lastModified = new Date()
return (this.lastModifiedRoundedToSeconds = new Date(
Math.floor(this.lastModified.getTime() / 1000) * 1000
))
})
describe('when URL does not exist in cache', function () {
beforeEach(function () {
this.UrlCache._findUrlDetails = sinon.stub().callsArgWith(2, null, null)
return this.UrlCache._doesUrlNeedDownloading(
this.project_id,
this.url,
this.lastModified,
this.callback
)
})
return it('should return the callback with true', function () {
return this.callback.calledWith(null, true).should.equal(true)
})
})
return describe('when URL does exist in cache', function () {
beforeEach(function () {
this.urlDetails = {}
return (this.UrlCache._findUrlDetails = sinon
.stub()
.callsArgWith(2, null, this.urlDetails))
})
describe('when the modified date is more recent than the cached modified date', function () {
beforeEach(function () {
this.urlDetails.lastModified = new Date(
this.lastModified.getTime() - 1000
)
return this.UrlCache._doesUrlNeedDownloading(
this.project_id,
this.url,
this.lastModified,
this.callback
)
})
it('should get the url details', function () {
return this.UrlCache._findUrlDetails
.calledWith(this.project_id, this.url)
.should.equal(true)
})
return it('should return the callback with true', function () {
return this.callback.calledWith(null, true).should.equal(true)
})
})
describe('when the cached modified date is more recent than the modified date', function () {
beforeEach(function () {
this.urlDetails.lastModified = new Date(
this.lastModified.getTime() + 1000
)
return this.UrlCache._doesUrlNeedDownloading(
this.project_id,
this.url,
this.lastModified,
this.callback
)
})
return it('should return the callback with false', function () {
return this.callback.calledWith(null, false).should.equal(true)
})
})
describe('when the cached modified date is equal to the modified date', function () {
beforeEach(function () {
this.urlDetails.lastModified = this.lastModified
return this.UrlCache._doesUrlNeedDownloading(
this.project_id,
this.url,
this.lastModified,
this.callback
)
})
return it('should return the callback with false', function () {
return this.callback.calledWith(null, false).should.equal(true)
})
})
describe('when the provided modified date does not exist', function () {
beforeEach(function () {
this.lastModified = null
return this.UrlCache._doesUrlNeedDownloading(
this.project_id,
this.url,
this.lastModified,
this.callback
)
})
return it('should return the callback with true', function () {
return this.callback.calledWith(null, true).should.equal(true)
})
})
return describe('when the URL does not have a modified date', function () {
beforeEach(function () {
this.urlDetails.lastModified = null
return this.UrlCache._doesUrlNeedDownloading(
this.project_id,
this.url,
this.lastModified,
this.callback
)
})
return it('should return the callback with true', function () {
return this.callback.calledWith(null, true).should.equal(true)
})
})
})
})
describe('_ensureUrlIsInCache', function () {
beforeEach(function () {
this.UrlFetcher.pipeUrlToFileWithRetry = sinon.stub().callsArg(2)
return (this.UrlCache._updateOrCreateUrlDetails = sinon
.stub()
.callsArg(3))
})
describe('when the URL needs updating', function () {
beforeEach(function () {
this.UrlCache._doesUrlNeedDownloading = sinon
.stub()
.callsArgWith(3, null, true)
return this.UrlCache._ensureUrlIsInCache(
this.project_id,
this.url,
this.lastModified,
this.callback
)
})
it('should check that the url needs downloading', function () {
return this.UrlCache._doesUrlNeedDownloading
.calledWith(
this.project_id,
this.url,
this.lastModifiedRoundedToSeconds
)
.should.equal(true)
})
it('should download the URL to the cache file', function () {
return this.UrlFetcher.pipeUrlToFileWithRetry
.calledWith(
this.url,
this.UrlCache._cacheFilePathForUrl(this.project_id, this.url)
)
.should.equal(true)
})
it('should update the database entry', function () {
return this.UrlCache._updateOrCreateUrlDetails
.calledWith(
this.project_id,
this.url,
this.lastModifiedRoundedToSeconds
)
.should.equal(true)
})
return it('should return the callback with the cache file path', function () {
return this.callback
.calledWith(
null,
this.UrlCache._cacheFilePathForUrl(this.project_id, this.url)
)
.should.equal(true)
})
})
return describe('when the URL does not need updating', function () {
beforeEach(function () {
this.UrlCache._doesUrlNeedDownloading = sinon
.stub()
.callsArgWith(3, null, false)
return this.UrlCache._ensureUrlIsInCache(
this.project_id,
this.url,
this.lastModified,
this.callback
)
})
it('should not download the URL to the cache file', function () {
return this.UrlFetcher.pipeUrlToFileWithRetry.called.should.equal(false)
})
return it('should return the callback with the cache file path', function () {
return this.callback
.calledWith(
null,
this.UrlCache._cacheFilePathForUrl(this.project_id, this.url)
)
.should.equal(true)
})
})
})
describe('downloadUrlToFile', function () {
beforeEach(function () {
this.cachePath = 'path/to/cached/url'
this.destPath = 'path/to/destination'
this.UrlCache._ensureUrlIsInCache = sinon
.stub()
.callsArgWith(3, null, this.cachePath)
return this.UrlCache.downloadUrlToFile(
})
it('should not download on the happy path', function (done) {
this.UrlCache.downloadUrlToFile(
this.project_id,
this.url,
this.destPath,
this.lastModified,
this.callback
error => {
expect(error).to.not.exist
expect(
this.UrlFetcher.promises.pipeUrlToFileWithRetry.called
).to.equal(false)
done()
}
)
})
it('should ensure the URL is downloaded and updated in the cache', function () {
return this.UrlCache._ensureUrlIsInCache
.calledWith(this.project_id, this.url, this.lastModified)
.should.equal(true)
})
it('should download on cache miss', function (done) {
const codedError = new Error()
codedError.code = 'ENOENT'
this.fs.promises.copyFile.onCall(0).rejects(codedError)
this.fs.promises.copyFile.onCall(1).resolves()
it('should copy the file to the new location', function () {
return this.fs.copyFile
.calledWith(this.cachePath, this.destPath)
.should.equal(true)
})
return it('should call the callback', function () {
return this.callback.called.should.equal(true)
})
})
describe('_deleteUrlCacheFromDisk', function () {
beforeEach(function () {
this.fs.unlink = sinon.stub().callsArg(1)
return this.UrlCache._deleteUrlCacheFromDisk(
this.UrlCache.downloadUrlToFile(
this.project_id,
this.url,
this.callback
this.destPath,
this.lastModified,
error => {
expect(error).to.not.exist
expect(
this.UrlFetcher.promises.pipeUrlToFileWithRetry.called
).to.equal(true)
done()
}
)
})
it('should delete the cache file', function () {
return this.fs.unlink
.calledWith(
this.UrlCache._cacheFilePathForUrl(this.project_id, this.url)
)
.should.equal(true)
})
return it('should call the callback', function () {
return this.callback.called.should.equal(true)
})
})
describe('_clearUrlFromCache', function () {
beforeEach(function () {
this.UrlCache._deleteUrlCacheFromDisk = sinon.stub().callsArg(2)
this.UrlCache._clearUrlDetails = sinon.stub().callsArg(2)
return this.UrlCache._clearUrlFromCache(
it('should raise non cache-miss errors', function (done) {
const codedError = new Error()
codedError.code = 'FOO'
this.fs.promises.copyFile.rejects(codedError)
this.UrlCache.downloadUrlToFile(
this.project_id,
this.url,
this.callback
this.destPath,
this.lastModified,
error => {
expect(error).to.equal(codedError)
done()
}
)
})
it('should delete the file on disk', function () {
return this.UrlCache._deleteUrlCacheFromDisk
.calledWith(this.project_id, this.url)
.should.equal(true)
})
it('should clear the entry in the database', function () {
return this.UrlCache._clearUrlDetails
.calledWith(this.project_id, this.url)
.should.equal(true)
})
return it('should call the callback', function () {
return this.callback.called.should.equal(true)
})
})
return describe('clearProject', function () {
beforeEach(function () {
this.urls = ['www.example.com/file1', 'www.example.com/file2']
this.UrlCache._findAllUrlsInProject = sinon
.stub()
.callsArgWith(1, null, this.urls)
this.UrlCache._clearUrlFromCache = sinon.stub().callsArg(2)
return this.UrlCache.clearProject(this.project_id, this.callback)
describe('clearProject', function () {
beforeEach(function (done) {
this.UrlCache.clearProject(this.project_id, done)
})
it('should clear the cache for each url in the project', function () {
return Array.from(this.urls).map(url =>
this.UrlCache._clearUrlFromCache
.calledWith(this.project_id, url)
.should.equal(true)
)
})
return it('should call the callback', function () {
return this.callback.called.should.equal(true)
it('should clear the cache in bulk', function () {
expect(
this.fse.remove.calledWith('/cache/dir/' + this.project_id)
).to.equal(true)
})
})
})

View file

@ -23,7 +23,10 @@ describe('UrlFetcher', function () {
request: {
defaults: (this.defaults = sinon.stub().returns((this.request = {}))),
},
fs: (this.fs = {}),
fs: (this.fs = {
rename: sinon.stub().yields(),
unlink: sinon.stub().yields(),
}),
'@overleaf/settings': (this.settings = {
apis: {
clsiPerf: {
@ -162,9 +165,15 @@ describe('UrlFetcher', function () {
.should.equal(true)
})
it('should open the file for writing', function () {
it('should open the atomic file for writing', function () {
return this.fs.createWriteStream
.calledWith(this.path)
.calledWith(this.path + '~')
.should.equal(true)
})
it('should move the atomic file to the target', function () {
return this.fs.rename
.calledWith(this.path + '~', this.path)
.should.equal(true)
})