mirror of
https://github.com/overleaf/overleaf.git
synced 2024-11-29 05:03:33 -05:00
Add script to migrate data from S3
This commit is contained in:
parent
ed0426478c
commit
f9862081b7
1 changed files with 123 additions and 0 deletions
123
services/docstore/scripts/rearchive-all-docs.js
Normal file
123
services/docstore/scripts/rearchive-all-docs.js
Normal file
|
@ -0,0 +1,123 @@
|
||||||
|
const DocArchiveManager = require('../app/js/DocArchiveManager').promises
|
||||||
|
const MongoManager = require('../app/js/MongoManager').promises
|
||||||
|
const { getCollection, ObjectId } = require('../app/js/mongodb')
|
||||||
|
const minimist = require('minimist')
|
||||||
|
|
||||||
|
async function worker(projectId) {
|
||||||
|
try {
|
||||||
|
// see if the project needs to be unarchived, and unarchive it
|
||||||
|
const archivedDocs = await MongoManager.getArchivedProjectDocs(projectId)
|
||||||
|
if (archivedDocs.length) {
|
||||||
|
await DocArchiveManager.unArchiveAllDocs(projectId)
|
||||||
|
}
|
||||||
|
|
||||||
|
// get the doc content so we can validate it
|
||||||
|
const docs = await MongoManager.getProjectsDocs(
|
||||||
|
projectId,
|
||||||
|
{ include_deleted: false },
|
||||||
|
{ lines: 1 }
|
||||||
|
)
|
||||||
|
|
||||||
|
// start archiving in the background while we check the content, if it was archived to begin with
|
||||||
|
let archivePromise
|
||||||
|
if (archivedDocs.length) {
|
||||||
|
archivePromise = DocArchiveManager.archiveAllDocs(projectId)
|
||||||
|
}
|
||||||
|
|
||||||
|
let warning = false
|
||||||
|
|
||||||
|
// validate the doc contents and log any warnings to investigate later
|
||||||
|
for (const doc of docs) {
|
||||||
|
if (!doc.lines) {
|
||||||
|
warning = true
|
||||||
|
console.error('WARN:', projectId, doc._id, 'has no content')
|
||||||
|
}
|
||||||
|
// eslint-disable-next-line no-control-regex
|
||||||
|
if (doc.lines && doc.lines.some((line) => line.match(/(\r|\u0000)/))) {
|
||||||
|
warning = true
|
||||||
|
console.error('WARN:', projectId, doc._id, 'has invalid characters')
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ensure the archive process has finished
|
||||||
|
if (archivePromise) {
|
||||||
|
await archivePromise
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!warning) {
|
||||||
|
// log to stderr along with the other output
|
||||||
|
console.error('OK:', projectId)
|
||||||
|
}
|
||||||
|
} catch (err) {
|
||||||
|
console.error('ERROR:', projectId, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async function rearchiveAllDocs() {
|
||||||
|
const params = minimist(process.argv.slice(2))
|
||||||
|
const maxWorkers = params.w || 1
|
||||||
|
console.log(`Starting with ${maxWorkers} workers`)
|
||||||
|
|
||||||
|
// start from an objectId and run in ascending order, so we can resume later
|
||||||
|
const query = {}
|
||||||
|
const startId = params._[0]
|
||||||
|
if (startId) {
|
||||||
|
const validator = new RegExp('^[0-9a-fA-F]{24}$')
|
||||||
|
if (!validator.test(startId)) {
|
||||||
|
console.error('Invalid object id')
|
||||||
|
return
|
||||||
|
}
|
||||||
|
query._id = {
|
||||||
|
$gt: ObjectId(startId)
|
||||||
|
}
|
||||||
|
console.log(`Starting from object ID ${startId}`)
|
||||||
|
} else {
|
||||||
|
console.log('No object id specified. Starting from the beginning.')
|
||||||
|
}
|
||||||
|
|
||||||
|
const results = (await getCollection('projects'))
|
||||||
|
.find(query, { _id: 1 })
|
||||||
|
.sort({ _id: 1 })
|
||||||
|
let jobCount = 0
|
||||||
|
|
||||||
|
// keep going until we run out of projects
|
||||||
|
while (true) {
|
||||||
|
// get a new project to run a job with
|
||||||
|
const project = await results.next()
|
||||||
|
// if there are no more projects, wait until all the jobs have finished and exit
|
||||||
|
if (!project) {
|
||||||
|
// eslint-disable-next-line no-unmodified-loop-condition
|
||||||
|
while (jobCount) {
|
||||||
|
await new Promise((resolve) => setTimeout(resolve, 50))
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// wait until there are fewer than maxWorkers jobs running
|
||||||
|
// eslint-disable-next-line no-unmodified-loop-condition
|
||||||
|
while (jobCount >= maxWorkers) {
|
||||||
|
await new Promise((resolve) => setTimeout(resolve, 50))
|
||||||
|
}
|
||||||
|
|
||||||
|
// start a new job in the background and then continue the loop
|
||||||
|
++jobCount
|
||||||
|
worker(project._id)
|
||||||
|
.then(() => --jobCount)
|
||||||
|
.catch(() => {
|
||||||
|
console.error('ERROR:', project._id)
|
||||||
|
--jobCount
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!module.parent) {
|
||||||
|
rearchiveAllDocs()
|
||||||
|
.then(() => {
|
||||||
|
console.log('Finished!')
|
||||||
|
process.exit(0)
|
||||||
|
})
|
||||||
|
.catch((err) => {
|
||||||
|
console.error('Something went wrong:', err)
|
||||||
|
process.exit(1)
|
||||||
|
})
|
||||||
|
}
|
Loading…
Reference in a new issue