2020-07-31 11:20:07 -04:00
|
|
|
const { callbackify } = require('util')
|
|
|
|
const MongoManager = require('./MongoManager').promises
|
2020-02-16 09:02:21 -05:00
|
|
|
const Errors = require('./Errors')
|
2021-12-14 08:00:35 -05:00
|
|
|
const logger = require('@overleaf/logger')
|
2021-07-12 12:47:20 -04:00
|
|
|
const settings = require('@overleaf/settings')
|
2020-02-16 09:02:21 -05:00
|
|
|
const crypto = require('crypto')
|
2020-07-23 14:42:49 -04:00
|
|
|
const Streamifier = require('streamifier')
|
2020-02-16 09:02:21 -05:00
|
|
|
const RangeManager = require('./RangeManager')
|
2020-07-23 14:42:49 -04:00
|
|
|
const PersistorManager = require('./PersistorManager')
|
2020-09-14 11:34:34 -04:00
|
|
|
const pMap = require('p-map')
|
2015-06-02 14:55:22 -04:00
|
|
|
|
2021-04-16 04:27:09 -04:00
|
|
|
// Batch sizes, concurrency and retry limits used by the bulk operations below.
// All of them are read once from settings when this module is loaded.
const {
  parallelArchiveJobs: PARALLEL_JOBS,
  archiveBatchSize: ARCHIVE_BATCH_SIZE,
  unArchiveBatchSize: UN_ARCHIVE_BATCH_SIZE,
  destroyBatchSize: DESTROY_BATCH_SIZE,
  destroyRetryCount: DESTROY_RETRY_COUNT,
} = settings
|
2015-06-02 14:55:22 -04:00
|
|
|
|
2020-07-23 14:42:49 -04:00
|
|
|
module.exports = {
|
|
|
|
archiveAllDocs: callbackify(archiveAllDocs),
|
2021-01-04 06:55:47 -05:00
|
|
|
archiveDocById: callbackify(archiveDocById),
|
2020-07-23 14:42:49 -04:00
|
|
|
archiveDoc: callbackify(archiveDoc),
|
|
|
|
unArchiveAllDocs: callbackify(unArchiveAllDocs),
|
|
|
|
unarchiveDoc: callbackify(unarchiveDoc),
|
|
|
|
destroyAllDocs: callbackify(destroyAllDocs),
|
|
|
|
destroyDoc: callbackify(destroyDoc),
|
2021-07-30 11:03:43 -04:00
|
|
|
getDoc: callbackify(getDoc),
|
2020-07-23 14:42:49 -04:00
|
|
|
promises: {
|
|
|
|
archiveAllDocs,
|
2021-01-04 06:55:47 -05:00
|
|
|
archiveDocById,
|
2020-07-23 14:42:49 -04:00
|
|
|
archiveDoc,
|
|
|
|
unArchiveAllDocs,
|
|
|
|
unarchiveDoc,
|
|
|
|
destroyAllDocs,
|
2021-07-13 07:04:48 -04:00
|
|
|
destroyDoc,
|
2021-07-30 11:03:43 -04:00
|
|
|
getDoc,
|
2021-07-13 07:04:48 -04:00
|
|
|
},
|
2020-07-23 14:42:49 -04:00
|
|
|
}
|
2015-06-02 14:55:22 -04:00
|
|
|
|
2020-07-23 14:42:49 -04:00
|
|
|
/**
 * Archive all docs of a project that MongoManager reports as not yet archived.
 *
 * Docs are fetched in batches of ARCHIVE_BATCH_SIZE and every batch is passed
 * to archiveDoc through pMap with a concurrency of PARALLEL_JOBS. The loop
 * ends as soon as a fetch yields no docs.
 *
 * @param {string} projectId
 * @returns {Promise<void>}
 */
async function archiveAllDocs(projectId) {
  for (;;) {
    const batch = await MongoManager.getNonArchivedProjectDocs(
      projectId,
      ARCHIVE_BATCH_SIZE
    )
    const isEmpty = !batch || batch.length === 0
    if (isEmpty) {
      return
    }

    await pMap(batch, doc => archiveDoc(projectId, doc), {
      concurrency: PARALLEL_JOBS,
    })
  }
}
|
2015-06-02 14:55:22 -04:00
|
|
|
|
2021-01-04 06:55:47 -05:00
|
|
|
/**
 * Look up a single doc by id and archive it unless it is already flagged inS3.
 *
 * @param {string} projectId
 * @param {string} docId
 * @returns {Promise<void>}
 * @throws {Errors.NotFoundError} when MongoManager.findDoc returns no doc
 */
async function archiveDocById(projectId, docId) {
  // fields needed by archiveDoc, plus the inS3 flag for the check below
  const projection = { lines: true, ranges: true, rev: true, inS3: true }
  const doc = await MongoManager.findDoc(projectId, docId, projection)

  if (!doc) {
    throw new Errors.NotFoundError(
      `Cannot find doc ${docId} in project ${projectId}`
    )
  }

  // TODO(das7pad): consider refactoring MongoManager.findDoc to take a query
  if (doc.inS3) {
    // already archived, nothing to do
    return
  }
  return archiveDoc(projectId, doc)
}
|
|
|
|
|
2020-07-23 14:42:49 -04:00
|
|
|
/**
 * Send one doc to the persistor and then mark it as archived in mongo.
 *
 * The doc is serialised as JSON ({ lines, ranges, schema_v: 1 }) and stored in
 * settings.docstore.bucket under the key `<projectId>/<docId>`. The md5 of the
 * JSON is handed to the persistor as `sourceMd5`.
 *
 * @param {string} projectId
 * @param {Object} doc - must carry `_id`, `lines` and `rev`; `ranges` is
 *   serialised as found
 * @returns {Promise<void>}
 * @throws {Error} when the doc has no lines or the serialised JSON contains a
 *   null byte
 */
async function archiveDoc(projectId, doc) {
  const docId = doc._id
  logger.log(
    { project_id: projectId, doc_id: docId },
    'sending doc to persistor'
  )
  const key = `${projectId}/${docId}`

  if (doc.lines == null) {
    throw new Error('doc has no lines')
  }

  const payload = {
    lines: doc.lines,
    ranges: doc.ranges,
    schema_v: 1,
  }
  const json = JSON.stringify(payload)

  // this should never happen, but protects against memory-corruption errors that
  // have happened in the past
  if (json.includes('\u0000')) {
    const error = new Error('null bytes detected')
    logger.err({ err: error, doc }, error.message)
    throw error
  }

  const hash = crypto.createHash('md5')
  hash.update(json)
  const md5 = hash.digest('hex')

  const stream = Streamifier.createReadStream(json)
  const uploadOptions = { sourceMd5: md5 }
  await PersistorManager.sendStream(
    settings.docstore.bucket,
    key,
    stream,
    uploadOptions
  )

  // only flag the doc as archived once the upload has succeeded
  await MongoManager.markDocAsArchived(doc._id, doc.rev)
}
|
|
|
|
|
|
|
|
/**
 * Un-archive the archived docs of a project, batch by batch.
 *
 * When settings.docstore.keepSoftDeletedDocsArchived is set, each batch comes
 * from MongoManager.getNonDeletedArchivedProjectDocs; otherwise from
 * MongoManager.getArchivedProjectDocs. Batches hold up to
 * UN_ARCHIVE_BATCH_SIZE docs and are processed through pMap with a
 * concurrency of PARALLEL_JOBS. The loop ends when a fetch yields no docs.
 *
 * @param {string} projectId
 * @returns {Promise<void>}
 */
async function unArchiveAllDocs(projectId) {
  for (;;) {
    // the setting is consulted again for every batch
    const batch = settings.docstore.keepSoftDeletedDocsArchived
      ? await MongoManager.getNonDeletedArchivedProjectDocs(
          projectId,
          UN_ARCHIVE_BATCH_SIZE
        )
      : await MongoManager.getArchivedProjectDocs(
          projectId,
          UN_ARCHIVE_BATCH_SIZE
        )

    const isEmpty = !batch || batch.length === 0
    if (isEmpty) {
      return
    }

    await pMap(batch, doc => unarchiveDoc(projectId, doc._id), {
      concurrency: PARALLEL_JOBS,
    })
  }
}
|
|
|
|
|
2021-07-30 11:03:43 -04:00
|
|
|
/**
 * Get the doc from the PersistorManager without storing it in mongo.
 *
 * The object at `<projectId>/<docId>` in settings.docstore.bucket is
 * downloaded and its md5 is compared with the hash reported by the persistor
 * before the JSON is parsed. Two stored formats are understood:
 *  - an object with `schema_v === 1` and `lines` (optionally `ranges`, which
 *    are passed through RangeManager.jsonRangesToMongo)
 *  - a bare array, which is taken to be the lines
 *
 * @param {string} projectId
 * @param {string} docId
 * @returns {Promise<Object>} `{ lines }`, plus `ranges` when the stored doc has them
 * @throws {Errors.Md5MismatchError} when the downloaded content does not match
 *   the persistor's md5
 * @throws {Error} when the stored JSON has neither of the known formats
 */
async function getDoc(projectId, docId) {
  const key = `${projectId}/${docId}`
  const bucket = settings.docstore.bucket

  const sourceMd5 = await PersistorManager.getObjectMd5Hash(bucket, key)
  const stream = await PersistorManager.getObjectStream(bucket, key)
  stream.resume()
  const buffer = await _streamToBuffer(stream)

  const hash = crypto.createHash('md5')
  hash.update(buffer)
  const md5 = hash.digest('hex')
  if (md5 !== sourceMd5) {
    throw new Errors.Md5MismatchError('md5 mismatch when downloading doc', {
      key,
      sourceMd5,
      md5,
    })
  }

  const doc = JSON.parse(buffer.toString())

  if (doc.schema_v === 1 && doc.lines != null) {
    const mongoDoc = { lines: doc.lines }
    if (doc.ranges != null) {
      mongoDoc.ranges = RangeManager.jsonRangesToMongo(doc.ranges)
    }
    return mongoDoc
  }

  if (Array.isArray(doc)) {
    // bare array: the parsed value itself is the list of lines
    return { lines: doc }
  }

  throw new Error("I don't understand the doc format in s3")
}
|
|
|
|
|
|
|
|
/**
 * Get the doc from the persistor and unarchive it to mongo.
 *
 * Nothing is done when the doc is not flagged inS3. Otherwise the archived
 * content is fetched with getDoc, written back through
 * MongoManager.upsertIntoDocCollection, and the archived object is then
 * deleted from settings.docstore.bucket.
 *
 * @param {string} projectId
 * @param {string} docId
 * @returns {Promise<void>}
 * @throws {Errors.NotFoundError} when the doc does not exist in mongo, or when
 *   the archived object is missing although the doc is still flagged inS3
 */
async function unarchiveDoc(projectId, docId) {
  logger.log(
    { project_id: projectId, doc_id: docId },
    'getting doc from persistor'
  )
  const key = `${projectId}/${docId}`

  const originalDoc = await MongoManager.findDoc(projectId, docId, { inS3: 1 })
  if (!originalDoc) {
    // a missing doc is reported as NotFoundError rather than failing with a
    // TypeError on the property access below
    throw new Errors.NotFoundError(
      `Cannot find doc ${docId} in project ${projectId}`
    )
  }
  if (!originalDoc.inS3) {
    // return if it's not actually in S3 as there's nothing to do
    return
  }

  let mongoDoc
  try {
    mongoDoc = await getDoc(projectId, docId)
  } catch (err) {
    // if we get a 404, we could be in a race and something else has unarchived the doc already
    if (err instanceof Errors.NotFoundError) {
      const doc = await MongoManager.findDoc(projectId, docId, { inS3: 1 })
      // when the doc has disappeared from mongo in the meantime we fall
      // through and report the original NotFoundError
      if (doc && !doc.inS3) {
        // the doc has been unarchived while we were looking for it, so no error
        return
      }
    }
    throw err
  }

  await MongoManager.upsertIntoDocCollection(projectId, docId, mongoDoc)
  // the archived copy is removed only after the mongo write has succeeded
  await PersistorManager.deleteObject(settings.docstore.bucket, key)
}
|
|
|
|
|
|
|
|
/**
 * Destroy all docs of a project, including deleted ones.
 *
 * Doc ids are fetched in batches of DESTROY_BATCH_SIZE via
 * MongoManager.getProjectsDocs and every batch is passed to destroyDoc
 * through pMap with a concurrency of PARALLEL_JOBS. The loop ends when a
 * fetch yields no docs.
 *
 * @param {string} projectId
 * @returns {Promise<void>}
 */
async function destroyAllDocs(projectId) {
  for (;;) {
    const query = { include_deleted: true, limit: DESTROY_BATCH_SIZE }
    const projection = { _id: 1 }
    const batch = await MongoManager.getProjectsDocs(
      projectId,
      query,
      projection
    )

    const isEmpty = !batch || batch.length === 0
    if (isEmpty) {
      return
    }

    await pMap(batch, doc => destroyDoc(projectId, doc._id), {
      concurrency: PARALLEL_JOBS,
    })
  }
}
|
|
|
|
|
|
|
|
/**
 * Remove a doc from the persistor (when it is flagged inS3) and from mongo.
 *
 * @param {string} projectId
 * @param {string} docId
 * @returns {Promise<void>}
 * @throws {Errors.NotFoundError} when MongoManager.findDoc returns no doc
 */
async function destroyDoc(projectId, docId) {
  logger.log(
    { project_id: projectId, doc_id: docId },
    'removing doc from mongo and persistor'
  )

  const projection = { inS3: 1 }
  const doc = await MongoManager.findDoc(projectId, docId, projection)
  if (!doc) {
    throw new Errors.NotFoundError('Doc not found in Mongo')
  }

  // the archived copy goes first; the mongo record is only removed once that
  // has succeeded
  const isArchived = doc.inS3
  if (isArchived) {
    await destroyArchiveWithRetry(projectId, docId)
  }

  await MongoManager.destroyDoc(docId)
}
|
|
|
|
|
2021-05-05 05:57:12 -04:00
|
|
|
/**
 * Delete the archived object of a doc from settings.docstore.bucket, trying
 * again after a failure.
 *
 * The delete is attempted up to DESTROY_RETRY_COUNT + 1 times. Each failure is
 * logged as a warning together with the attempt number (counted from 1). When
 * every attempt fails, the error of the last one is thrown.
 *
 * @param {string} projectId
 * @param {string} docId
 * @returns {Promise<void>}
 */
async function destroyArchiveWithRetry(projectId, docId) {
  let attemptsMade = 0
  let failure

  while (attemptsMade <= DESTROY_RETRY_COUNT) {
    attemptsMade += 1
    try {
      const bucket = settings.docstore.bucket
      const key = `${projectId}/${docId}`
      await PersistorManager.deleteObject(bucket, key)
      return
    } catch (err) {
      failure = err
      logger.warn(
        { projectId, docId, err, attempt: attemptsMade },
        'destroying archive failed'
      )
    }
  }

  throw failure
}
|
|
|
|
|
2021-11-29 05:04:35 -05:00
|
|
|
/**
 * Collect everything a readable stream emits into a single Buffer.
 *
 * @param {Object} stream - an event emitter with the readable stream events
 *   'data', 'end', 'error' and 'close'
 * @returns {Promise<Buffer>} the concatenated chunks, resolved on 'end'
 * @throws rejects with the stream's error on 'error', or with an Error when
 *   the stream emits 'close' without having emitted 'end' or 'error'
 */
async function _streamToBuffer(stream) {
  return new Promise((resolve, reject) => {
    const chunks = []

    stream.on('data', chunk => {
      // string chunks would make Buffer.concat throw inside the 'end'
      // handler, so they are converted up front
      chunks.push(typeof chunk === 'string' ? Buffer.from(chunk) : chunk)
    })
    stream.on('error', reject)
    stream.on('end', () => resolve(Buffer.concat(chunks)))
    // A stream that closes without 'end' or 'error' would otherwise leave the
    // promise pending forever. After 'end' or 'error' the promise is already
    // settled and this call has no effect.
    stream.on('close', () => reject(new Error('stream closed before end')))
  })
}
|