2024-11-08 10:21:56 +00:00
|
|
|
const { callbackify } = require('node:util')
|
2020-07-31 15:20:07 +00:00
|
|
|
const MongoManager = require('./MongoManager').promises
|
2020-02-16 14:02:21 +00:00
|
|
|
const Errors = require('./Errors')
|
2021-12-14 13:00:35 +00:00
|
|
|
const logger = require('@overleaf/logger')
|
2022-04-26 11:16:36 +00:00
|
|
|
const Settings = require('@overleaf/settings')
|
2024-11-08 10:21:56 +00:00
|
|
|
const crypto = require('node:crypto')
|
2023-06-01 11:38:45 +00:00
|
|
|
const { ReadableString } = require('@overleaf/stream-utils')
|
2020-02-16 14:02:21 +00:00
|
|
|
const RangeManager = require('./RangeManager')
|
2020-07-23 18:42:49 +00:00
|
|
|
const PersistorManager = require('./PersistorManager')
|
2020-09-14 15:34:34 +00:00
|
|
|
const pMap = require('p-map')
|
2024-07-30 15:55:25 +00:00
|
|
|
const { BSON } = require('mongodb-legacy')
|
2022-09-26 08:39:26 +00:00
|
|
|
|
2022-04-26 11:16:36 +00:00
|
|
|
// Concurrency limit handed to p-map when archiving or unarchiving the docs of
// a project.
const PARALLEL_JOBS = Settings.parallelArchiveJobs
// Second argument of the MongoManager archived-docs queries used by
// unArchiveAllDocs (presumably the maximum number of docs per batch).
const UN_ARCHIVE_BATCH_SIZE = Settings.unArchiveBatchSize
|
2015-06-02 18:55:22 +00:00
|
|
|
|
2020-07-23 18:42:49 +00:00
|
|
|
module.exports = {
|
|
|
|
archiveAllDocs: callbackify(archiveAllDocs),
|
|
|
|
archiveDoc: callbackify(archiveDoc),
|
|
|
|
unArchiveAllDocs: callbackify(unArchiveAllDocs),
|
|
|
|
unarchiveDoc: callbackify(unarchiveDoc),
|
2022-04-26 11:16:36 +00:00
|
|
|
destroyProject: callbackify(destroyProject),
|
2021-07-30 15:03:43 +00:00
|
|
|
getDoc: callbackify(getDoc),
|
2020-07-23 18:42:49 +00:00
|
|
|
promises: {
|
|
|
|
archiveAllDocs,
|
|
|
|
archiveDoc,
|
|
|
|
unArchiveAllDocs,
|
|
|
|
unarchiveDoc,
|
2022-04-26 11:16:36 +00:00
|
|
|
destroyProject,
|
2021-07-30 15:03:43 +00:00
|
|
|
getDoc,
|
2021-07-13 11:04:48 +00:00
|
|
|
},
|
2020-07-23 18:42:49 +00:00
|
|
|
}
|
2015-06-02 18:55:22 +00:00
|
|
|
|
2020-07-23 18:42:49 +00:00
|
|
|
/**
 * Archive every doc of a project that MongoManager reports as not archived.
 *
 * Does nothing when archiving is not enabled. Docs are archived through
 * archiveDoc, at most PARALLEL_JOBS at a time.
 *
 * @param projectId - id of the project whose docs are archived
 * @returns {Promise<void>}
 */
async function archiveAllDocs(projectId) {
  if (!_isArchivingEnabled()) {
    return
  }

  const docIds = await MongoManager.getNonArchivedProjectDocIds(projectId)
  await pMap(docIds, docId => archiveDoc(projectId, docId), {
    concurrency: PARALLEL_JOBS,
  })
}
|
2015-06-02 18:55:22 +00:00
|
|
|
|
2022-05-17 11:49:13 +00:00
|
|
|
/**
 * Archive a single doc: serialize it to JSON, send it to the persistor under
 * the key `<projectId>/<docId>` and mark it as archived in Mongo.
 *
 * Returns without doing anything when archiving is not enabled or when
 * MongoManager.getDocForArchiving yields no doc.
 *
 * @param projectId - id of the project owning the doc
 * @param docId - id of the doc to archive
 * @returns {Promise<void>}
 * @throws {Error} when the doc has no lines, or when the serialized doc
 *   contains a null byte
 */
async function archiveDoc(projectId, docId) {
  if (!_isArchivingEnabled()) {
    return
  }

  const doc = await MongoManager.getDocForArchiving(projectId, docId)

  if (!doc) {
    // The doc wasn't found, it was already archived, or the lock couldn't be
    // acquired. Since we don't know which it is, silently return.
    return
  }

  logger.debug({ projectId, docId: doc._id }, 'sending doc to persistor')
  const key = `${projectId}/${doc._id}`

  if (doc.lines == null) {
    throw new Error('doc has no lines')
  }

  // warn about any oversized docs already in mongo
  const linesSize = BSON.calculateObjectSize(doc.lines || {})
  const rangesSize = BSON.calculateObjectSize(doc.ranges || {})
  if (
    linesSize > Settings.max_doc_length ||
    rangesSize > Settings.max_doc_length
  ) {
    logger.warn(
      { projectId, docId: doc._id, linesSize, rangesSize },
      'large doc found when archiving project'
    )
  }

  // schema_v identifies the archive format; _deserializeArchivedDoc checks it
  // when the doc is read back.
  const json = JSON.stringify({
    lines: doc.lines,
    ranges: doc.ranges,
    rev: doc.rev,
    schema_v: 1,
  })

  // this should never happen, but protects against memory-corruption errors that
  // have happened in the past
  if (json.includes('\u0000')) {
    const error = new Error('null bytes detected')
    logger.err({ err: error, doc }, error.message)
    throw error
  }

  // The md5 of the serialized doc is handed to the persistor as `sourceMd5`.
  const md5 = crypto.createHash('md5').update(json).digest('hex')
  const stream = new ReadableString(json)
  await PersistorManager.sendStream(Settings.docstore.bucket, key, stream, {
    sourceMd5: md5,
  })
  // Only mark the doc as archived once the upload has succeeded; the rev that
  // was serialized is passed along.
  await MongoManager.markDocAsArchived(projectId, docId, doc.rev)
}
|
|
|
|
|
|
|
|
/**
 * Unarchive the archived docs of a project, batch by batch.
 *
 * Does nothing when archiving is not enabled. When
 * Settings.docstore.keepSoftDeletedDocsArchived is set, the batches come from
 * MongoManager.getNonDeletedArchivedProjectDocs instead of
 * MongoManager.getArchivedProjectDocs.
 *
 * @param projectId - id of the project whose docs are unarchived
 * @returns {Promise<void>}
 */
async function unArchiveAllDocs(projectId) {
  if (!_isArchivingEnabled()) {
    return
  }

  // Keep requesting batches until the query comes back empty. This presumably
  // relies on unarchiveDoc taking each doc out of the archived set — verify
  // against MongoManager.
  while (true) {
    let docs
    if (Settings.docstore.keepSoftDeletedDocsArchived) {
      docs = await MongoManager.getNonDeletedArchivedProjectDocs(
        projectId,
        UN_ARCHIVE_BATCH_SIZE
      )
    } else {
      docs = await MongoManager.getArchivedProjectDocs(
        projectId,
        UN_ARCHIVE_BATCH_SIZE
      )
    }
    if (!docs || docs.length === 0) {
      break
    }
    await pMap(docs, doc => unarchiveDoc(projectId, doc._id), {
      concurrency: PARALLEL_JOBS,
    })
  }
}
|
|
|
|
|
2021-07-30 15:03:43 +00:00
|
|
|
/**
 * Get the doc from the PersistorManager without storing it in mongo.
 *
 * The object stored under `<projectId>/<docId>` is downloaded in full, its md5
 * is compared with the md5 reported by the persistor, and the content is then
 * deserialized.
 *
 * @param projectId - id of the project owning the doc
 * @param docId - id of the archived doc
 * @returns {Promise<Object>} the result of _deserializeArchivedDoc
 * @throws {Errors.Md5MismatchError} when the downloaded content does not match
 *   the md5 reported by the persistor
 */
async function getDoc(projectId, docId) {
  const key = `${projectId}/${docId}`
  const sourceMd5 = await PersistorManager.getObjectMd5Hash(
    Settings.docstore.bucket,
    key
  )
  const stream = await PersistorManager.getObjectStream(
    Settings.docstore.bucket,
    key
  )
  // Resume the stream before handing it to _streamToBuffer, which attaches
  // the listeners that collect the content.
  stream.resume()
  const buffer = await _streamToBuffer(projectId, docId, stream)
  const md5 = crypto.createHash('md5').update(buffer).digest('hex')
  if (sourceMd5 !== md5) {
    throw new Errors.Md5MismatchError('md5 mismatch when downloading doc', {
      key,
      sourceMd5,
      md5,
    })
  }

  return _deserializeArchivedDoc(buffer)
}
|
|
|
|
|
|
|
|
/**
 * Get the doc from the persistor and unarchive it to mongo.
 *
 * Returns without doing anything when the Mongo record is not flagged `inS3`.
 *
 * @param projectId - id of the project owning the doc
 * @param docId - id of the doc to unarchive
 * @returns {Promise<void>}
 * @throws {Error} when the doc is flagged as archived but archiving is not
 *   enabled
 */
async function unarchiveDoc(projectId, docId) {
  logger.debug({ projectId, docId }, 'getting doc from persistor')
  // NOTE(review): if findDoc can resolve to null for an unknown doc, the
  // `inS3` access below throws a TypeError — confirm against MongoManager.
  const mongoDoc = await MongoManager.findDoc(projectId, docId, {
    inS3: 1,
    rev: 1,
  })
  if (!mongoDoc.inS3) {
    // The doc is already unarchived
    return
  }

  if (!_isArchivingEnabled()) {
    throw new Error(
      'found archived doc, but archiving backend is not configured'
    )
  }

  const archivedDoc = await getDoc(projectId, docId)
  if (archivedDoc.rev == null) {
    // Older archived docs didn't have a rev. Assume that the rev of the
    // archived doc is the rev that was stored in Mongo when we retrieved it
    // earlier.
    archivedDoc.rev = mongoDoc.rev
  }
  await MongoManager.restoreArchivedDoc(projectId, docId, archivedDoc)
}
|
|
|
|
|
2022-04-26 11:16:36 +00:00
|
|
|
/**
 * Remove all data of a project: the Mongo records via
 * MongoManager.destroyProject and, when archiving is enabled, the project
 * directory in the docstore bucket.
 *
 * Both deletions are started before either is awaited; the returned promise
 * rejects if any of them fails.
 *
 * @param projectId - id of the project to destroy
 * @returns {Promise<void>}
 */
async function destroyProject(projectId) {
  const tasks = [MongoManager.destroyProject(projectId)]
  if (_isArchivingEnabled()) {
    tasks.push(
      PersistorManager.deleteDirectory(Settings.docstore.bucket, projectId)
    )
  }
  await Promise.all(tasks)
}
|
|
|
|
|
2023-03-23 14:22:55 +00:00
|
|
|
/**
 * Collect all chunks emitted by a stream into a single Buffer.
 *
 * A warning is logged when the accumulated size exceeds
 * Settings.max_doc_length: once while reading, and once more when the stream
 * has ended. The content is still returned in full.
 *
 * @param projectId - only used as log context
 * @param docId - only used as log context
 * @param stream - emitter of 'data', 'error' and 'end' events; each chunk is
 *   expected to expose `byteLength`
 * @returns {Promise<Buffer>} the concatenated chunks; rejects with the error
 *   emitted by the stream
 */
async function _streamToBuffer(projectId, docId, stream) {
  const chunks = []
  let size = 0
  let logged = false
  const logIfTooLarge = finishedReading => {
    if (size <= Settings.max_doc_length) return
    // Log progress once and then again at the end.
    if (logged && !finishedReading) return
    logger.warn(
      { projectId, docId, size, finishedReading },
      'potentially large doc pulled down from gcs'
    )
    logged = true
  }
  return await new Promise((resolve, reject) => {
    stream.on('data', chunk => {
      size += chunk.byteLength
      logIfTooLarge(false)
      chunks.push(chunk)
    })
    stream.on('error', reject)
    stream.on('end', () => {
      logIfTooLarge(true)
      resolve(Buffer.concat(chunks))
    })
  })
}
|
|
|
|
|
2023-03-23 14:23:06 +00:00
|
|
|
/**
 * Parse the JSON content of an archived doc.
 *
 * Two formats are understood:
 * - an object with `schema_v: 1` and `lines`, optionally with `ranges` (which
 *   are passed through RangeManager.jsonRangesToMongo) and `rev`;
 * - a bare array, which is taken to be the lines of the doc.
 *
 * @param {Buffer|string} buffer - serialized archived doc
 * @returns {{lines: Array, ranges?: Object, rev?: *}} the deserialized doc
 * @throws {SyntaxError} when the content is not valid JSON
 * @throws {Error} when the parsed content matches neither format
 */
function _deserializeArchivedDoc(buffer) {
  const doc = JSON.parse(buffer)

  const result = {}
  // `doc` can be null (JSON `null`); guard so that it ends up in the
  // unknown-format branch instead of crashing on the property access.
  if (doc != null && doc.schema_v === 1 && doc.lines != null) {
    result.lines = doc.lines
    if (doc.ranges != null) {
      result.ranges = RangeManager.jsonRangesToMongo(doc.ranges)
    }
  } else if (Array.isArray(doc)) {
    result.lines = doc
  } else {
    throw new Error("I don't understand the doc format in s3")
  }

  if (doc.rev != null) {
    result.rev = doc.rev
  }

  return result
}
|
|
|
|
|
2022-04-26 11:16:36 +00:00
|
|
|
/**
 * Tell whether an archiving backend is usable according to Settings.docstore.
 *
 * @returns {boolean} false when no backend is set, or when the backend is
 *   's3' without a `Settings.docstore.s3` section; true otherwise
 */
function _isArchivingEnabled() {
  const backend = Settings.docstore.backend

  if (!backend) {
    return false
  }

  // The default backend is S3. If another backend is configured or the S3
  // backend itself is correctly configured, then archiving is enabled.
  if (backend === 's3' && Settings.docstore.s3 == null) {
    return false
  }

  return true
}
|