overleaf/services/docstore/scripts/rearchive-all-docs.js

const DocArchiveManager = require('../app/js/DocArchiveManager').promises
const MongoManager = require('../app/js/MongoManager').promises
const { getCollection, ObjectId } = require('../app/js/mongodb')
const minimist = require('minimist')

async function worker(projectId) {
  try {
    // see if the project needs to be unarchived, and unarchive it
    const archivedDocs = await MongoManager.getArchivedProjectDocs(projectId)
    if (archivedDocs.length) {
      await DocArchiveManager.unArchiveAllDocs(projectId)
    }

    // get the doc content so we can validate it
    const docs = await MongoManager.getProjectsDocs(
      projectId,
      { include_deleted: false },
      { lines: 1 }
    )

    // start archiving in the background while we check the content, if it was archived to begin with
    let archivePromise
    if (archivedDocs.length) {
      archivePromise = DocArchiveManager.archiveAllDocs(projectId)
    }

    let warning = false

    // validate the doc contents and log any warnings to investigate later
    for (const doc of docs) {
      if (!doc.lines) {
        warning = true
        console.error('WARN:', projectId, doc._id, 'has no content')
      }
      // eslint-disable-next-line no-control-regex
      if (doc.lines && doc.lines.some((line) => line.match(/(\r|\u0000)/))) {
        warning = true
        console.error('WARN:', projectId, doc._id, 'has invalid characters')
      }
    }

    // ensure the archive process has finished
    if (archivePromise) {
      await archivePromise
    }

    if (!warning) {
      // log to stderr along with the other output
      console.error('OK:', projectId)
    }
  } catch (err) {
    console.error('ERROR:', projectId, err)
  }
}

async function rearchiveAllDocs() {
  const params = minimist(process.argv.slice(2))
  const maxWorkers = params.w || 1
  console.log(`Starting with ${maxWorkers} workers`)

  // start from an objectId and run in ascending order, so we can resume later
  const query = {}
  const startId = params._[0]
  const endId = params.e
  if (startId) {
    if (!new RegExp('^[0-9a-fA-F]{24}$').test(startId)) {
      throw new Error('Invalid start object id')
    }
    query._id = {
      $gte: ObjectId(startId)
    }
    console.log(`Starting from object ID ${startId}`)
  } else {
    console.log('No object id specified. Starting from the beginning.')
  }

  if (endId) {
    if (!new RegExp('^[0-9a-fA-F]{24}$').test(endId)) {
      throw new Error('Invalid end object id')
    }
    query._id = query._id || {}
    query._id.$lte = ObjectId(endId)
    console.log(`Stopping at object ID ${endId}`)
  }

  const results = (await getCollection('projects'))
    .find(query, { _id: 1 })
    .sort({ _id: 1 })
  let jobCount = 0

  // keep going until we run out of projects
  while (true) {
    // get a new project to run a job with
    const project = await results.next()
    // if there are no more projects, wait until all the jobs have finished and exit
    if (!project) {
      // eslint-disable-next-line no-unmodified-loop-condition
      while (jobCount) {
        await new Promise((resolve) => setTimeout(resolve, 50))
      }
      return
    }

    // wait until there are fewer than maxWorkers jobs running
    // eslint-disable-next-line no-unmodified-loop-condition
    while (jobCount >= maxWorkers) {
      await new Promise((resolve) => setTimeout(resolve, 50))
    }

    // start a new job in the background and then continue the loop
    ++jobCount
    worker(project._id)
      .then(() => --jobCount)
      .catch(() => {
        console.error('ERROR:', project._id)
        --jobCount
      })
  }
}

if (!module.parent) {
  rearchiveAllDocs()
    .then(() => {
      console.log('Finished!')
      process.exit(0)
    })
    .catch((err) => {
      console.error('Something went wrong:', err)
      process.exit(1)
    })
}