overleaf/services/docstore/app/js/DocArchiveManager.js

const { callbackify } = require('util')
const MongoManager = require('./MongoManager').promises
const Errors = require('./Errors')
const logger = require('logger-sharelatex')
const settings = require('@overleaf/settings')
const crypto = require('crypto')
const Streamifier = require('streamifier')
const RangeManager = require('./RangeManager')
const PersistorManager = require('./PersistorManager')
const pMap = require('p-map')
const PARALLEL_JOBS = settings.parallelArchiveJobs
const ARCHIVE_BATCH_SIZE = settings.archiveBatchSize
const UN_ARCHIVE_BATCH_SIZE = settings.unArchiveBatchSize
const DESTROY_BATCH_SIZE = settings.destroyBatchSize
const DESTROY_RETRY_COUNT = settings.destroyRetryCount
module.exports = {
archiveAllDocs: callbackify(archiveAllDocs),
archiveDocById: callbackify(archiveDocById),
archiveDoc: callbackify(archiveDoc),
unArchiveAllDocs: callbackify(unArchiveAllDocs),
unarchiveDoc: callbackify(unarchiveDoc),
destroyAllDocs: callbackify(destroyAllDocs),
destroyDoc: callbackify(destroyDoc),
getDoc: callbackify(getDoc),
promises: {
archiveAllDocs,
archiveDocById,
archiveDoc,
unArchiveAllDocs,
unarchiveDoc,
destroyAllDocs,
destroyDoc,
getDoc,
},
}
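
// archive every non-archived doc in the project, in batches, with limited concurrency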
async function archiveAllDocs(projectId) {
while (true) {
const docs = await MongoManager.getNonArchivedProjectDocs(
projectId,
ARCHIVE_BATCH_SIZE
)
if (!docs || docs.length === 0) {
break
}
await pMap(docs, doc => archiveDoc(projectId, doc), {
concurrency: PARALLEL_JOBS,
})
}
}
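
// look up a single doc and archive it unless it is already in S3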
async function archiveDocById(projectId, docId) {
const doc = await MongoManager.findDoc(projectId, docId, {
lines: true,
ranges: true,
rev: true,
inS3: true,
})
if (!doc) {
throw new Errors.NotFoundError(
`Cannot find doc ${docId} in project ${projectId}`
)
}
// TODO(das7pad): consider refactoring MongoManager.findDoc to take a query
if (doc.inS3) return
return archiveDoc(projectId, doc)
}
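
// serialise the doc's lines and ranges to JSON, upload them to the persistor, then mark the doc as archived in mongo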
async function archiveDoc(projectId, doc) {
logger.log(
{ project_id: projectId, doc_id: doc._id },
'sending doc to persistor'
)
const key = `${projectId}/${doc._id}`
if (doc.lines == null) {
throw new Error('doc has no lines')
}
const json = JSON.stringify({
lines: doc.lines,
ranges: doc.ranges,
schema_v: 1,
})
// this should never happen, but protects against memory-corruption errors that
// have happened in the past
if (json.indexOf('\u0000') > -1) {
const error = new Error('null bytes detected')
logger.err({ err: error, doc }, error.message)
throw error
}
const md5 = crypto.createHash('md5').update(json).digest('hex')
const stream = Streamifier.createReadStream(json)
await PersistorManager.sendStream(settings.docstore.bucket, key, stream, {
sourceMd5: md5,
})
await MongoManager.markDocAsArchived(doc._id, doc.rev)
}
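
// restore every archived doc in the project back into mongo, in batches
// (soft-deleted docs are skipped when keepSoftDeletedDocsArchived is set)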
async function unArchiveAllDocs(projectId) {
while (true) {
let docs
if (settings.docstore.keepSoftDeletedDocsArchived) {
docs = await MongoManager.getNonDeletedArchivedProjectDocs(
projectId,
UN_ARCHIVE_BATCH_SIZE
)
} else {
docs = await MongoManager.getArchivedProjectDocs(
projectId,
UN_ARCHIVE_BATCH_SIZE
)
}
if (!docs || docs.length === 0) {
break
}
await pMap(docs, doc => unarchiveDoc(projectId, doc._id), {
concurrency: PARALLEL_JOBS,
})
}
}
// get the doc from the PersistorManager without storing it in mongo
async function getDoc(projectId, docId) {
const key = `${projectId}/${docId}`
const sourceMd5 = await PersistorManager.getObjectMd5Hash(
settings.docstore.bucket,
key
)
const stream = await PersistorManager.getObjectStream(
settings.docstore.bucket,
key
)
stream.resume()
const json = await _streamToString(stream)
const md5 = crypto.createHash('md5').update(json).digest('hex')
if (sourceMd5 !== md5) {
throw new Errors.Md5MismatchError('md5 mismatch when downloading doc', {
key,
sourceMd5,
md5,
})
}
const doc = JSON.parse(json)
const mongoDoc = {}
if (doc.schema_v === 1 && doc.lines != null) {
mongoDoc.lines = doc.lines
if (doc.ranges != null) {
mongoDoc.ranges = RangeManager.jsonRangesToMongo(doc.ranges)
}
} else if (Array.isArray(doc)) {
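    // legacy format: the archived doc is stored as a bare array of lines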
mongoDoc.lines = doc
} else {
throw new Error("I don't understand the doc format in s3")
}
return mongoDoc
}
// get the doc and unarchive it to mongo
async function unarchiveDoc(projectId, docId) {
logger.log(
{ project_id: projectId, doc_id: docId },
'getting doc from persistor'
)
const key = `${projectId}/${docId}`
const originalDoc = await MongoManager.findDoc(projectId, docId, { inS3: 1 })
if (!originalDoc.inS3) {
// return if it's not actually in S3 as there's nothing to do
return
}
let mongoDoc
try {
mongoDoc = await getDoc(projectId, docId)
} catch (err) {
// if we get a 404, we could be in a race and something else has unarchived the doc already
if (err instanceof Errors.NotFoundError) {
const doc = await MongoManager.findDoc(projectId, docId, { inS3: 1 })
if (!doc.inS3) {
// the doc has been unarchived while we were looking for it, so no error
return
}
}
throw err
}
await MongoManager.upsertIntoDocCollection(projectId, docId, mongoDoc)
await PersistorManager.deleteObject(settings.docstore.bucket, key)
}
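
// permanently delete every doc in the project, including soft-deleted ones, in batches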
async function destroyAllDocs(projectId) {
while (true) {
const docs = await MongoManager.getProjectsDocs(
projectId,
{ include_deleted: true, limit: DESTROY_BATCH_SIZE },
{ _id: 1 }
)
if (!docs || docs.length === 0) {
break
}
await pMap(docs, doc => destroyDoc(projectId, doc._id), {
concurrency: PARALLEL_JOBS,
})
}
}
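
// permanently remove a doc from mongo and, if archived, from the persistor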
async function destroyDoc(projectId, docId) {
logger.log(
{ project_id: projectId, doc_id: docId },
'removing doc from mongo and persistor'
)
const doc = await MongoManager.findDoc(projectId, docId, {
inS3: 1,
})
if (!doc) {
throw new Errors.NotFoundError('Doc not found in Mongo')
}
if (doc.inS3) {
await destroyArchiveWithRetry(projectId, docId)
}
await MongoManager.destroyDoc(docId)
}
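
// delete the archived object from the persistor, retrying failed deletes up to DESTROY_RETRY_COUNT times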
async function destroyArchiveWithRetry(projectId, docId) {
let attempt = 0
let lastError
while (attempt++ <= DESTROY_RETRY_COUNT) {
try {
await PersistorManager.deleteObject(
settings.docstore.bucket,
`${projectId}/${docId}`
)
return
} catch (err) {
lastError = err
logger.warn(
{ projectId, docId, err, attempt },
'destroying archive failed'
)
}
}
throw lastError
}
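
// buffer a readable stream and return its contents as a utf-8 string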
async function _streamToString(stream) {
const chunks = []
return new Promise((resolve, reject) => {
stream.on('data', chunk => chunks.push(chunk))
stream.on('error', reject)
stream.on('end', () => resolve(Buffer.concat(chunks).toString('utf8')))
})
}