overleaf/services/web/modules/history-migration/app/src/ProjectHistoryController.js


const _ = require('lodash')
const settings = require('@overleaf/settings')
const OError = require('@overleaf/o-error')
const fs = require('fs')
const fse = require('fs-extra')
const { ObjectId } = require('mongodb')
const request = require('request')
const { pipeline } = require('stream')
const unzipper = require('unzipper')
const util = require('util')
const logger = require('@overleaf/logger')
const path = require('path')
const {
FileTooLargeError,
InvalidNameError,
} = require('../../../../app/src/Features/Errors/Errors')
const FilestoreHandler = require('../../../../app/src/Features/FileStore/FileStoreHandler')
const ProjectGetter = require('../../../../app/src/Features/Project/ProjectGetter')
const RedisWrapper = require('../../../../app/src/infrastructure/RedisWrapper')
const HistoryManager = require('../../../../app/src/Features/History/HistoryManager')
const ProjectHistoryHandler = require('../../../../app/src/Features/Project/ProjectHistoryHandler')
const ProjectUpdateHandler = require('../../../../app/src/Features/Project/ProjectUpdateHandler')
const DocumentUpdaterHandler = require('../../../../app/src/Features/DocumentUpdater/DocumentUpdaterHandler')
const ProjectEntityHandler = require('../../../../app/src/Features/Project/ProjectEntityHandler')
const ProjectEntityUpdateHandler = require('../../../../app/src/Features/Project/ProjectEntityUpdateHandler')
const SafePath = require('../../../../app/src/Features/Project/SafePath')
const { DeletedFile } = require('../../../../app/src/models/DeletedFile')
const { Doc } = require('../../../../app/src/models/Doc')
const {
iterablePaths,
} = require('../../../../app/src/Features/Project/IterablePath')
const rclient = RedisWrapper.client('project_history_migration')
module.exports = { deleteProjectHistory, migrateProjectHistory }
/**
* @typedef {Object} UpdateMeta
* @property {string | null} user_id the id of the user that performed the update
* @property {number} ts the timestamp of the update
*/
/**
* @typedef {UpdateMeta} EditDocUpdateMeta
* @property {string | null} user_id
* @property {number} ts
* @property {string} pathname the doc pathname
* @property {number} doc_length the length of the doc
*/
/**
* @typedef {Object} Update
* @property {string} pathname the path in the file tree
* @property {UpdateMeta} meta
// * @property {string} version a two-part version. The first part is the project version after the updates, as recorded in Mongo. The second part is a counter that increments for each update in this batch.
* @property {string} projectHistoryId the v1 history id for this project
* @property {number} v
*/
/**
* @typedef {Update} FileUpdate
* @property {string} pathname
* @property {UpdateMeta} meta
* @property {string} projectHistoryId
* @property {number} v
* @property {string} file
*/
/**
* @typedef {FileUpdate} AddFileUpdate
* @property {string} pathname
* @property {UpdateMeta} meta
* @property {string} projectHistoryId
* @property {number} v
* @property {string} file
* @property {string} url
*/
/**
* @typedef {Update} DocUpdate
* @property {UpdateMeta} meta
* @property {string} projectHistoryId
* @property {number} v
* @property {string} doc
*/
/**
* @typedef {DocUpdate} AddDocUpdate
* @property {string} pathname
* @property {UpdateMeta} meta
* @property {string} projectHistoryId
* @property {number} v
* @property {string} doc
* @property {string} docLines
* @property {string} docLinesId
* @property {boolean} contentStored
*/
/**
* @typedef {DocUpdate} EditDocUpdate
* @property {EditDocUpdateMeta} meta
* @property {string} projectHistoryId
* @property {number} v
* @property {number} lastV
* @property {string} doc
* @property {Array<Object>} op
*/
/**
* @typedef {AddDocUpdate | AddFileUpdate} AddUpdate
*/
/**
* @typedef {DocUpdate | FileUpdate} DeleteUpdate
* @property {string} pathname
* @property {UpdateMeta} meta
* @property {string} projectHistoryId
* @property {number} v
* @property {string} doc
* @property {string} new_pathname
*/
/**
* @typedef {Update} EditDocUpdateStub
* @property {true} stub
* @property {string} path
* @property {string} pathname
* @property {number} v
* @property {number} doc_length
*/
/**
* @typedef {AddUpdate | DeleteUpdate | EditDocUpdate | EditDocUpdateStub } AnyUpdate
*/
/**
* @typedef {Object} Project
* @property {string} _id the id of the project
* @property {Object} overleaf
*/
/**
* @typedef ManifestUpdate
* @property {string} path
* @property {number} doc_length
* @property {number} ts
* @property {number} version
*/
/**
* @typedef ManifestContent
* @property {{ path: string, version: number }} start the initial doc content: the path of the content entry within the zip and the doc version it corresponds to
*/
/**
* @typedef ManifestDoc
* @property {string} id
* @property {ManifestContent} content
* @property {Array<ManifestUpdate>} updates
*/
/**
* @typedef {Object} Manifest
* @property {string} projectId
* @property {Array<ManifestDoc>} docs
*/
/**
* @typedef Entity
* @property {string} type
* @property {string} path
* @property {string} docLines
* @property {string} deletedAt
* @property {boolean} deleted
*/
/**
* Iterate recursively through the folders in project.rootFolder,
* building a map of all the docs (with content as a docLines string)
* and files (with content as a filestore URL).
*
* @param {Object} project
* @returns {Promise<Map<string, Entity>>}
*/
async function processRootFolder(project) {
const entities = new Map()
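// Illustrative entries in the entities map (ids, paths and contents are made up):
//   '60a1…' => { path: '/main.tex', type: 'doc', docLines: '\\documentclass{article}\n…' }
//   '60a2…' => { path: '/images/plot.png', type: 'file', url: '<filestore url>' }
// readDeletedDocs/readDeletedFiles later add entries with deleted: true and a deletedAt date.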
async function processFolder(folder, root = '') {
for (const item of iterablePaths(folder, 'docs')) {
const doc = await Doc.findOne(
item._id,
// only read the fields we need to save memory
{ _id: 1, inS3: 1, lines: 1, name: 1 }
).lean()
// skip malformed doc entries
if (!doc?._id) {
logger.warn({ doc }, 'skipping doc with missing id')
continue
}
const id = doc._id.toString()
const docIsInS3 = !!doc.inS3
let docLines
if (docIsInS3) {
const docPeek = await ProjectEntityHandler.promises.getDoc(
project._id,
item._id,
{ peek: true }
)
docLines = docPeek.lines
} else {
docLines = doc.lines
}
if (!docLines) {
throw new Error(`no doc lines for doc ${id} (inS3: ${docIsInS3})`)
}
entities.set(id, {
path: `${root}/${item.name}`, // NOTE: not doc.name, which is "new doc",
type: 'doc',
docLines: docLines.join('\n'),
})
}
for (const item of iterablePaths(folder, 'fileRefs')) {
const path = `${root}/${item.name}`
// skip malformed file entries
if (!item?._id) {
logger.warn({ item }, 'skipping fileRef with missing id')
continue
}
const id = item._id.toString()
entities.set(id, {
path,
type: 'file',
url: FilestoreHandler._buildUrl(project._id.toString(), id),
})
}
for (const subfolder of iterablePaths(folder, 'folders')) {
const path = `${root}/${subfolder.name}`
await processFolder(subfolder, path)
}
}
for (const folder of project.rootFolder) {
await processFolder(folder)
}
return entities
}
/**
* Read docs deleted from a project, from the Doc collection,
* and add them to the entities map with the content in a docLines string.
*
* These entities have a `deleted` property set to `true` and a `deletedAt` date.
*
* @param {Map<string, Object>} entities
* @param {string} projectId
* @returns {Promise<void>}
*/
async function readDeletedDocs(entities, projectId) {
// NOTE: could call DocstoreManager.promises.getAllDeletedDocs(projectId) instead.
// Look for all docs, since some deleted docs appear in the track-changes manifest
// but do not have the deleted flag set, for reasons that are unclear
// (we will not add docs to entities if they were already added by processRootFolder)
const deletedDocsCursor = Doc.find(
{
project_id: ObjectId(projectId),
},
// only read the fields we need to save memory
{ _id: 1, inS3: 1, lines: 1, name: 1, deletedAt: 1 }
)
.lean()
.cursor()
for await (const doc of deletedDocsCursor) {
// skip malformed deleted doc entries
if (!doc?._id) {
logger.warn({ doc }, 'skipping deleted doc with missing id')
continue
}
const id = doc._id.toString()
// Skip doc if we already have an entry in entities
if (!entities.has(id)) {
const docIsInS3 = !!doc.inS3
let docLines
if (docIsInS3) {
const docPeek = await ProjectEntityHandler.promises.getDoc(
ObjectId(projectId),
doc._id,
{ peek: true }
)
docLines = docPeek.lines
} else {
docLines = doc.lines
}
if (!docLines) {
throw new Error(`no doc lines for doc ${id} (inS3: ${docIsInS3})`)
}
// const ts = Number(
// doc.deletedAt ? new Date(doc.deletedAt) : Date.now()
// )
if (doc.name && !SafePath.isCleanFilename(doc.name)) {
const newName = SafePath.clean(doc.name)
logger.warn(
{ projectId, docId: id, origName: doc.name, newName },
'renaming invalid deleted doc'
)
doc.name = newName
}
entities.set(id, {
// NOTE: adding the doc id to the file path to avoid collisions
path: `/_deleted/${id}/${doc.name}`,
name: doc.name || 'unnamed', // fallback for improperly deleted docs
deleted: true,
type: 'doc',
deletedAt: doc.deletedAt,
docLines: docLines.join('\n'),
})
}
}
}
/**
* Read files deleted from a project, from the DeletedFile collection,
* and add them to the entities map.
*
* These entities have a `deleted` property set to `true` and a `deletedAt` date.
* The url is built later, from the project id and file id.
*
* @param {Map<string, Object>} entities
* @param {string} projectId
* @returns {Promise<void>}
*/
async function readDeletedFiles(entities, projectId) {
const deletedFilesCursor = DeletedFile.find(
{
projectId: ObjectId(projectId),
},
// only read the fields we need to save memory
{ _id: 1, name: 1, deletedAt: 1 }
)
.lean()
.cursor()
for await (const file of deletedFilesCursor) {
// skip malformed deleted file entries
if (!file?._id) {
logger.warn({ file }, 'skipping deleted file with missing id')
continue
}
const id = file._id.toString()
// TODO: check if it already exists?
if (!entities.has(id)) {
// const ts = Number(
// file.deletedAt ? new Date(file.deletedAt) : Date.now()
// )
// TODO: would the hash be useful here?
if (file.name && !SafePath.isCleanFilename(file.name)) {
const newName = SafePath.clean(file.name)
logger.warn(
{ projectId, fileId: id, origName: file.name, newName },
'renaming invalid deleted file'
)
file.name = newName
}
entities.set(id, {
// NOTE: adding the file id to the file path to avoid collisions
path: `/_deleted/${id}/${file.name}`,
name: file.name,
deleted: true,
type: 'file',
deletedAt: file.deletedAt,
})
}
}
}
/**
* Iterate through the sorted array of updates, pushing each one to Redis.
*
* In batches, tell project-history to pull the updates from Redis and process them,
* so the process fails early if something can't be processed.
*
* @param {Array<AnyUpdate>} updates
* @param {string} projectId
* @param {string} projectHistoryId
* @param {Map.<string, Object>} fileMap
* @returns {Promise<number>} the number of updates left in the Redis queue after the final flush (should be zero)
*/
async function sendUpdatesToProjectHistory(
updates,
projectId,
projectHistoryId,
fileMap
) {
let multi = rclient.multi()
let counter = 0
let processed = 0
let size = 0
const projectHistoryKey =
settings.redis.project_history_migration.key_schema.projectHistoryOps({
projectId,
})
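// The key comes from the project_history_migration key_schema in settings; it is
// expected to point at the same per-project ops list that project-history drains
// when flushProject is called below.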
// clear out anything in the Redis queue for this project's history
multi.del(projectHistoryKey)
for (let update of updates) {
// read the content for each update stub from the archive
if (update.stub) {
update = await buildEditDocUpdate(projectHistoryId, update, fileMap)
}
// non-edit doc updates need string timestamps, not numbers
if (!('op' in update)) {
update.meta.ts = new Date(update.meta.ts).toISOString()
}
const updateJSON = JSON.stringify(update)
multi.rpush(projectHistoryKey, updateJSON)
counter++
processed++
size += updateJSON.length
// flush the history after every 1000 updates and start a new transaction
if (counter === 1000) {
logger.debug(
{ processed, total: updates.length },
'sending updates to project history'
)
// execute the transaction
await util.promisify(multi.exec)()
// tell project-history to pull the updates from the Redis queue
await HistoryManager.promises.flushProject(projectId) // TODO: roll back if this fails?
counter = 0
size = 0
multi = rclient.multi()
} else if (size > 1024 * 1024) {
// queue entries in redis more frequently to reduce memory usage
await util.promisify(multi.exec)()
size = 0
multi = rclient.multi()
}
}
if (counter > 0) {
// execute the transaction
await util.promisify(multi.exec)()
// tell project-history to pull the updates from the Redis queue
await HistoryManager.promises.flushProject(projectId) // TODO: roll back if this fails?
}
// return the queue length so we can check that it is empty
const queueLength = await rclient.llen(projectHistoryKey)
return queueLength
}
/**
* Compare two queues of updates, ordering the queue whose tail (last) update has the earliest timestamp first.
*
* @param {Array<AnyUpdate>} a
* @param {Array<AnyUpdate>} b
* @returns {number}
*/
function earliestTimestampFirst(a, b) {
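// Example: with a = [{meta: {ts: 10}}, {meta: {ts: 30}}] and b = [{meta: {ts: 5}}, {meta: {ts: 20}}],
// the tail timestamps are 30 and 20, so b sorts before a and its tail is popped first
// by the merge loop in migrateTrackChangesUpdates.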
// both arrays are empty, leave them
if (!a.length && !b.length) {
return 0
}
// a is empty, move b before a
if (!a.length) {
return 1
}
// b is empty, don't move b before a
if (!b.length) {
return -1
}
const tsB = b[b.length - 1].meta.ts
const tsA = a[a.length - 1].meta.ts
// if the last item in b has a lower timestamp than the last item in a, move b above a
if (tsB < tsA) {
return 1
}
if (tsB > tsA) {
return -1
}
// use pathnames as secondary sort key, to make order deterministic for
// updates with the same timestamp
const pathnameB = b[b.length - 1].pathname
const pathnameA = a[a.length - 1].pathname
if (pathnameB < pathnameA) {
return 1
}
if (pathnameB > pathnameA) {
return -1
}
return 0 // shouldn't happen, because pathnames must be distinct
}
/**
* Compare two updates, with the highest version number first
*
* @param {AnyUpdate} a
* @param {AnyUpdate} b
* @returns {number}
*/
function decreasingDocVersion(a, b) {
if (b.v === a.v) {
throw new Error(`Matching version: ${b.v} ${a.v}`)
// return 0
}
// if b.v is greater than a.v, sort b above a
return b.v > a.v ? 1 : -1
}
/**
* Group the updates into one queue per doc/file, each queue sorted in decreasing version order
*
* @param {Array<AnyUpdate>} updates
* @returns {Promise<Array<AnyUpdate>>}
*/
async function sortUpdatesByQueue(updates) {
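// Example: updates for docs A and B with versions [3, 1, 2] and [2, 1] come out as
// [[A.v3, A.v2, A.v1], [B.v2, B.v1]]: one queue per doc, in decreasing version order.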
// build a queue of updates for each doc/file
const queues = {}
for (const update of updates) {
const docId = update.doc || update.file
if (!(docId in queues)) {
queues[docId] = []
}
queues[docId].push(update)
}
// convert the map to an array of queues
const values = Object.values(queues)
for (const queue of values) {
// sort each queue in place, with updates in decreasing version order
queue.sort(decreasingDocVersion)
}
return values
}
/**
* Fetch all the content and updates for this project from track-changes, as a zip archive.
*
* @param {string} projectId
* @param {string} tempFilePath
* @returns {Promise<void>}
*/
async function fetchTrackChangesArchive(projectId, tempFilePath) {
const writeStream = fs.createWriteStream(tempFilePath)
const url = `${settings.apis.trackchanges.url}/project/${projectId}/zip`
// exposed for debugging during full-project-history migration
const timeout =
parseInt(process.env.FETCH_TRACK_CHANGES_TIMEOUT, 10) || 2 * 60 * 1000
try {
await util.promisify(pipeline)(request(url, { timeout }), writeStream)
} catch (err) {
logger.error({ err }, 'Error fetching track changes archive')
throw err
}
const { size } = await fs.promises.stat(tempFilePath)
logger.info({ projectId, size }, 'fetched zip file from track-changes')
}
/**
* Open the zip archive and build a Map of each entry in the archive, with the path as the key
*
* @param {string} filePath
* @returns {Promise<Map<string, Object>>}
*/
async function openTrackChangesArchive(filePath) {
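// unzipper.Open.file reads only the zip central directory; each entry's content is
// loaded on demand later via file.buffer(), which keeps memory usage low for
// projects with large histories.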
const directory = await unzipper.Open.file(filePath)
return new Map(directory.files.map(file => [file.path, file]))
}
/**
* Read the manifest data from the zip archive
*
* @param {Map<string, Object>} fileMap
* @returns {Promise<Manifest>}
*/
async function readTrackChangesManifest(fileMap) {
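// Illustrative manifest.json shape (inferred from the typedefs above and the fields
// read in buildUpdates; ids, paths and values are made up):
//   {
//     "projectId": "5f0c…",
//     "docs": [{
//       "id": "60a1…",
//       "content": { "start": { "path": "content/60a1…", "version": 1 } },
//       "updates": [{ "path": "updates/60a1…/1", "doc_length": 120, "ts": 1600000000000, "version": 2 }]
//     }]
//   }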
const manifestBuffer = await fileMap.get('manifest.json').buffer()
return JSON.parse(manifestBuffer.toString())
}
/**
* Check that entities conform to the pathnames allowed by project history
*
* @param {Map<string, Object>} entities
* @param {string} projectId
*/
function validatePaths(entities, projectId) {
const pathErrors = []
for (const [id, entity] of entities) {
if (!SafePath.isCleanPath(entity.path)) {
pathErrors.push(
`${entity.type}:${id}${entity.deleted ? ' (deleted)' : ''} path:${
entity.path
}`
)
}
}
if (pathErrors.length) {
throw new OError('Invalid path in history migration', {
projectId,
pathErrors,
})
}
}
/**
* Build an "add" update for an entity, with docLines or url set for the content.
* This represents a doc or file being added to a project.
*
* @param {Object} entity
* @param {string} entityId
* @param {string} projectId
* @param {string} projectHistoryId
*
* @returns {AddDocUpdate | AddFileUpdate}
*/
function buildAddUpdate(entity, entityId, projectId, projectHistoryId) {
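// There is no stored creation date for the entity, so derive one from its ObjectId:
// the first four bytes of a Mongo ObjectId encode its creation time.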
const ts = new ObjectId(entityId).getTimestamp()
const update = {
pathname: entity.path,
v: 0, // NOTE: only for sorting
meta: {
// source?
user_id: null, // TODO: assign the update to a system user?
ts: Number(ts),
origin: { kind: 'history-migration' },
},
projectHistoryId,
}
switch (entity.type) {
case 'doc': {
return {
doc: entityId,
...update,
docLines: entity.docLines,
}
}
case 'file': {
// TODO: set a hash here?
return {
// type: 'external',
file: entityId,
...update,
url: FilestoreHandler._buildUrl(projectId, entityId),
}
}
default:
throw new Error('Unknown entity type')
}
}
/**
* Build a "delete" update for an entity, with new_pathname set to an empty string.
* This represents a doc or file being deleted from a project.
*
* @param {Object} entity
* @param {string} entityId
* @param {string} projectId
* @param {string} projectHistoryId
* @returns {DeleteUpdate}
*/
function buildDeleteUpdate(entity, entityId, projectId, projectHistoryId) {
const ts = entity.deletedAt || new Date()
const update = {
pathname: entity.path,
new_pathname: '', // empty path = deletion
v: Infinity, // NOTE: only for sorting
meta: {
user_id: null, // TODO: assign this to a system user?
ts: Number(ts),
origin: { kind: 'history-migration' },
},
projectHistoryId,
}
switch (entity.type) {
case 'doc':
return {
doc: entityId,
...update,
}
case 'file':
return {
file: entityId,
...update,
}
default:
throw new Error(`Unknown entity type ${entity.type}`)
}
}
/**
* @typedef TrackedDocUpdateMeta
* @property {string} user_id
* @property {number} start_ts
*/
/**
* @typedef TrackedDocUpdate
* @property {string} doc_id
* @property {Array<Object>} op
* @property {number} v
* @property {TrackedDocUpdateMeta} meta
*/
/**
* Build an "edit" update, with op set to an array of operations from track-changes.
*
* This represents the contents of a doc being edited in a project.
*
* @param {string} projectHistoryId
* @param {EditDocUpdateStub} updateStub
* @param {Map.<string, Object>} fileMap
*
* @returns {Promise<EditDocUpdate>}
*/
async function buildEditDocUpdate(projectHistoryId, updateStub, fileMap) {
const buffer = await fileMap.get(updateStub.path).buffer()
/**
* @type TrackedDocUpdate
*/
const data = JSON.parse(buffer.toString())
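// Illustrative shape of a parsed update (fields are the ones read below; ids and
// values are made up, and the op format is the insert/delete form used by track-changes):
//   { "doc_id": "60a1…", "op": [{ "p": 0, "i": "hello" }], "v": 42,
//     "meta": { "user_id": "60b2…", "start_ts": 1600000000000 } }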
let userId = data.meta.user_id
if (userId === 'anonymous-user' || userId === 'null') {
userId = null
}
if (userId != null && !/^[0-9a-f]{24}$/.test(userId)) {
throw new OError('Bad user id in ShareLaTeX history edit update', {
userId,
})
}
return {
doc: data.doc_id,
op: data.op, // NOTE: this is an array of operations
v: data.v,
lastV: data.v - 1,
meta: {
user_id: userId,
ts: data.meta.start_ts, // TODO: use data.meta.end_ts or update.ts?
pathname: updateStub.pathname,
doc_length: updateStub.doc_length,
origin: { kind: 'history-migration' },
},
projectHistoryId,
}
}
/**
* Build a stub for an "edit" update, with all the metadata but not the actual operations.
*
* This represents a doc being edited in a project, with enough information for sorting,
* but avoids loading the actual operations from the zip archive until they're needed,
* so as not to run out of memory if the project's history is large.
*
* @param {ManifestUpdate} update
* @param {Entity} entity
* @param {string} docId
* @returns {EditDocUpdateStub}
*/
function buildEditUpdateStub(update, entity, docId) {
return {
stub: true,
doc: docId,
v: update.version,
path: update.path,
pathname: entity.path,
doc_length: update.doc_length,
meta: {
ts: update.ts,
origin: { kind: 'history-migration' },
},
}
}
/**
* Build the sorted array of updates to be sent to project-history.
*
* 1. Process all the added and edited files from the track-changes archive.
* 2. Process the other files from the project that have been added, and maybe deleted, without any edits.
*
* @param {string} projectId
* @param {string} projectHistoryId
* @param {Manifest} manifest
* @param {Map.<string, Entity>} entities
* @param {Map.<string, Object>} fileMap
* @returns {Promise<Array<AnyUpdate>>}
*/
async function buildUpdates(
projectId,
projectHistoryId,
manifest,
entities,
fileMap
) {
/**
* @type Array<AnyUpdate>
*/
const updates = []
// keep a list of doc ids which have updates in track-changes
const updatedDocs = new Set()
// process the existing docs with updates, from track-changes
for (const doc of manifest.docs) {
const entity = entities.get(doc.id)
if (!entity) {
throw new Error(`Entity not found for ${doc.id}`)
}
if (!entity.path) {
throw new Error(`Path not found for ${doc.id}`)
}
// add the initial content
const contentStart = doc.content.start
const buffer = await fileMap.get(contentStart.path).buffer()
/**
* @type AddDocUpdate
*/
const update = {
doc: doc.id,
pathname: entity.path,
v: contentStart.version - 1,
meta: {
user_id: null, // TODO: assign this to a system user?
ts: Number(ObjectId(doc.id).getTimestamp()),
origin: { kind: 'history-migration' },
},
projectHistoryId,
docLines: buffer.toString(),
}
updates.push(update)
// push a stub for each edit update onto the array of updates
for (const update of doc.updates) {
updates.push(buildEditUpdateStub(update, entity, doc.id))
}
updatedDocs.add(doc.id)
}
// process the docs which have been added/deleted without any updates being recorded
for (const [id, entity] of entities.entries()) {
if (entity.deleted) {
// deleted entity
// add the doc/file
if (!updatedDocs.has(id)) {
updates.push(buildAddUpdate(entity, id, projectId, projectHistoryId))
}
// delete the doc/file again (there may be updates added between adding and deleting)
updates.push(buildDeleteUpdate(entity, id, projectId, projectHistoryId))
} else {
if (!updatedDocs.has(id)) {
// add "not deleted" doc that isn't in the manifest either
updates.push(buildAddUpdate(entity, id, projectId, projectHistoryId))
}
}
}
return updates
}
/**
* Remove the `overleaf.history` object from the project and tell project-history to delete everything for this project.
* (note: project-history may not delete the actual history data yet, but it will at least delete the cached history id)
*
* @param {string} projectId
* @returns {Promise<void>}
*/
async function deleteProjectHistory(projectId) {
await HistoryManager.promises.deleteProjectHistory(projectId)
// TODO: send a message to document-updater?
await ProjectHistoryHandler.unsetHistory(projectId)
}
/**
* Send the updates from the track changes zip file to project history
*
* @param {string} projectId
* @param {string} projectHistoryId
* @param {Array<AnyUpdate>} updates
* @param {Map.<string, Object>} fileMap
*/
async function migrateTrackChangesUpdates(
projectId,
projectHistoryId,
updates,
fileMap
) {
// Build a queue for each doc, sorted by version (and by timestamp within each version)
const queues = await sortUpdatesByQueue(updates)
const sortedUpdates = []
let item
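// Merge the per-doc queues into a single chronologically ordered stream: repeatedly
// re-sort the queues by the timestamp of their tail update, then pop that earliest tail.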
do {
// Find the earliest item from the tail of all queues
queues.sort(earliestTimestampFirst)
item = queues[0].pop()
if (item) {
sortedUpdates.push(item)
}
} while (item)
// NOTE: leaving the version string code commented out, in case it ends up being needed
// let majorVersion = 0
// let minorVersion = 0
for (const update of sortedUpdates) {
// increment majorVersion if this is a file change
if (!('op' in update)) {
// remove v (only used for sorting)
delete update.v
// set version
// majorVersion++
// // minorVersion = 0
// update.version = `${majorVersion}.${minorVersion}` // NOTE: not set as project-history doesn't need it and could cause problems if it gets higher than project.version
}
// increment minorVersion after every update
// minorVersion++
}
// add each update to the Redis queue for project-history to process
logger.debug(
{ projectId, projectHistoryId },
'Sending updates for project to Redis'
)
const remainingQueueLength = await sendUpdatesToProjectHistory(
sortedUpdates,
projectId,
projectHistoryId,
fileMap
)
// Failure will cause queued updates to be deleted (in the catch below)
logger.debug(
{
projectId,
projectHistoryId,
remainingQueueLength,
},
'Updates sent to project-history'
)
if (remainingQueueLength > 0) {
throw new Error('flush to project-history did not complete')
}
// TODO: roll back if any of the following fail?
// TODO: check that the Redis queue is empty?
// Clear any old entries in the main project history queue (these will not
// have a history id)
await HistoryManager.promises.flushProject(projectId)
}
/**
* Add the zip file from track changes to the project file tree.
* We may be able to recover a failed history from the zip file in future.
*
* @param {string} projectId
* @param {string} rootFolderId
* @param {string} tempFilePath
*/
async function uploadTrackChangesArchiveToProject(
projectId,
rootFolderId,
tempFilePath
) {
const { size } = await fs.promises.stat(tempFilePath)
if (size > settings.maxUploadSize) {
throw new FileTooLargeError({
message: 'track-changes archive exceeds maximum size for archiving',
info: { size },
})
}
const { fileRef } = await ProjectEntityUpdateHandler.promises.addFile(
projectId,
rootFolderId, // project.rootFolder[0]._id,
`OverleafHistory-${new Date().toISOString().substring(0, 10)}.zip`,
tempFilePath,
null,
null, // no owner
null // no source
)
logger.debug(
{ projectId, fileRef },
'Uploaded track-changes zip archive to project due to error in migration'
)
}
/**
* Check all updates for invalid characters (non-BMP characters, lone surrogates or
* null bytes) and substitute the Unicode replacement character if
* options.fixInvalidCharacters is true, otherwise throw an exception. Also rejects
* docs whose initial content exceeds the history size limit, and projects edited
* after options.cutoffDate.
* @param {Array<AnyUpdate>} updates
* @param {string} projectId
* @param {Object} options
* @param {boolean} [options.fixInvalidCharacters]
* @param {Date} [options.cutoffDate]
*/
function validateUpdates(updates, projectId, options) {
const replace = options.fixInvalidCharacters
// check for invalid characters
function containsBadChars(str) {
return /[\uD800-\uDBFF]/.test(str) || str.indexOf('\x00') !== -1
}
// Replace invalid characters so that they will be accepted by history_v1.
function sanitise(str) {
if (replace) {
// use global regexes so every occurrence is replaced, not just the first
return str.replace(/[\uD800-\uDFFF]/g, '\uFFFD').replace(/\x00/g, '\uFFFD')
} else {
throw new Error('invalid character in content')
}
}
// Check size of doclines in update against max size allowed by history_v1.
// This catches docs which are too large when created, but not when they
// go over the limit due to edits.
function checkSize(update) {
if (update?.docLines?.length > settings.max_doc_length) {
throw new FileTooLargeError({
message: 'docLines exceeds maximum size for history',
info: { docId: update.doc, size: update.docLines.length },
})
}
}
let latestTimestamp = 0
// Iterate over all the updates and their doclines or ops
for (const update of updates) {
checkSize(update)
// Find the timestamp of the most recent edit (either adding a doc or editing a doc)
// we exclude deletions as these are created in the migration and we didn't record
// the deletion time for older files.
const isDeleteUpdate = update.new_pathname === ''
if (
update.doc &&
!isDeleteUpdate &&
update.meta.ts &&
update.meta.ts > latestTimestamp
) {
latestTimestamp = update.meta.ts
}
if (update.docLines && containsBadChars(update.docLines)) {
logger.debug({ update, replace }, 'invalid character in docLines')
update.docLines = sanitise(update.docLines)
}
if (update.op) {
for (const op of update.op) {
if (op.i && containsBadChars(op.i)) {
logger.debug({ update, replace }, 'invalid character in insert op')
op.i = sanitise(op.i)
}
if (op.d && containsBadChars(op.d)) {
logger.debug({ update, replace }, 'invalid character in delete op')
op.d = sanitise(op.d)
}
}
}
}
logger.debug(
{ projectId, latestTimestamp, date: new Date(latestTimestamp) },
'timestamp of most recent edit'
)
if (options.cutoffDate && new Date(latestTimestamp) > options.cutoffDate) {
throw new Error('project was edited after cutoff date')
}
}
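// Minimal usage sketch (the option names are the ones checked by this module; the
// calling script and concrete values are assumed):
//
//   await migrateProjectHistory(projectId, {
//     archiveOnFailure: true,
//     fixInvalidCharacters: false,
//     cutoffDate: new Date('2022-01-01'),
//   })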
/**
* Migrate a project's history from track-changes to project-history
*
* @param {string} projectId
* @param {Object} [options]
* @param {string} [options.importZipFilePath] use an existing track-changes zip archive on disk instead of fetching one
* @param {boolean} [options.archiveOnFailure] on failure, archive the zip file in the project instead of failing the migration
* @param {boolean} [options.forceNewHistoryOnFailure] keep the new history even if the final flush/resync fails
* @param {boolean} [options.fixInvalidCharacters] replace invalid characters instead of rejecting the project
* @param {Date} [options.cutoffDate] reject projects edited after this date
*
* @returns {Promise<void>}
*/
async function migrateProjectHistory(projectId, options = {}) {
await fse.ensureDir(settings.path.projectHistories)
const projectHistoriesDir = await fs.promises.realpath(
settings.path.projectHistories
)
const tempDir = await fs.promises.mkdtemp(projectHistoriesDir + path.sep)
const tempFilePath = path.join(tempDir, 'project.zip')
try {
// fetch the zip archive of rewound content and updates from track-changes
// store the zip archive to disk, open it and build a Map of the entries
if (options.importZipFilePath) {
// use an existing track-changes archive on disk
logger.debug(
{ src: options.importZipFilePath, dst: tempFilePath },
'importing zip file'
)
await fs.promises.copyFile(options.importZipFilePath, tempFilePath)
const { size } = await fs.promises.stat(tempFilePath)
logger.info({ projectId, size }, 'imported zip file from disk')
} else {
await fetchTrackChangesArchive(projectId, tempFilePath)
}
const fileMap = await openTrackChangesArchive(tempFilePath)
// read the manifest from the zip archive
const manifest = await readTrackChangesManifest(fileMap)
// check that the project id in the manifest matches
// to be sure we are using the correct zip file
if (manifest.projectId !== projectId) {
throw new Error(`Incorrect projectId: ${manifest.projectId}`)
}
// load the Project from MongoDB
const project = await ProjectGetter.promises.getProject(projectId)
// check whether the project already has a history id
const oldProjectHistoryId = _.get(project, 'overleaf.history.id')
// throw an error if there is already a history associated with the project
if (oldProjectHistoryId) {
throw new Error(
`Project ${projectId} already has history ${oldProjectHistoryId}`
)
}
try {
// initialize a new project history and use the history id
// NOTE: not setting the history id on the project yet
const projectHistoryId = await HistoryManager.promises.initializeProject(
projectId
)
try {
// build a Map of the entities (docs and fileRefs) currently in the project,
// with _id as the key
const entities = await processRootFolder(project)
// find all the deleted docs for this project and add them to the entity map
await readDeletedDocs(entities, projectId)
// find all the deleted files for this project and add them to the entity map
await readDeletedFiles(entities, projectId)
// check that the paths will not be rejected
validatePaths(entities, projectId)
// build the array of updates that make up the new history for this project
const updates = await buildUpdates(
projectId,
projectHistoryId,
manifest,
entities,
fileMap
)
// check that the updates don't contain any characters that will be rejected by history_v1.
validateUpdates(updates, projectId, options)
if (updates.length) {
await migrateTrackChangesUpdates(
projectId,
projectHistoryId,
updates,
fileMap
)
}
} catch (error) {
if (options?.archiveOnFailure) {
// on error, optionally store the zip file in the project for future reference
logger.debug(
{ projectId, error },
'Error sending track-changes updates to project history, attempting to archive zip file in project'
)
try {
await uploadTrackChangesArchiveToProject(
projectId,
project.rootFolder[0]._id,
tempFilePath
)
} catch (error) {
if (error instanceof InvalidNameError) {
logger.info({ projectId }, 'zip file already archived in project')
} else {
throw error
}
} finally {
// roll back the last updated timestamp and user
logger.debug(
{ projectId },
'rolling back last updated time after uploading zip file'
)
await ProjectUpdateHandler.promises.resetUpdated(
projectId,
project.lastUpdated,
project.lastUpdatedBy
)
}
// set the overleaf.history.zipFileArchivedInProject flag for future reference
await ProjectHistoryHandler.promises.setMigrationArchiveFlag(
projectId
)
// we consider archiving the zip file as "success" (at least we've given up on attempting
// to migrate the history) so we don't rethrow the error and continue to initialise the new
// empty history below.
} else {
// if we're not archiving the zip file then we rethrow the error to fail the migration
throw error
}
}
// set the project's history id once the updates have been successfully processed
// (or we have given up and archived the zip file in the project).
logger.debug(
{ projectId, projectHistoryId },
'Setting history id on project'
)
await ProjectHistoryHandler.promises.setHistoryId(
projectId,
projectHistoryId
)
try {
// tell document updater to reload docs with the new history id
logger.debug({ projectId }, 'Asking document-updater to clear project')
await DocumentUpdaterHandler.promises.flushProjectToMongoAndDelete(
projectId
)
// run a project history resync in case any changes have arrived since the migration
logger.debug(
{ projectId },
'Asking project-history to force resync project'
)
await HistoryManager.promises.resyncProject(projectId, {
force: true,
origin: { kind: 'history-migration' },
})
} catch (error) {
if (options.forceNewHistoryOnFailure) {
logger.warn(
{ projectId },
'failed to resync project, forcing new history'
)
} else {
throw error
}
}
// set the display to v2 history
logger.debug(
{ projectId },
'Switching on full project history display for project'
)
await ProjectHistoryHandler.promises.upgradeHistory(projectId, true)
} catch (error) {
// delete the history id again if something failed?
logger.warn(
OError.tag(
error,
'Something went wrong flushing and resyncing project; clearing full project history for project',
{ projectId }
)
)
await deleteProjectHistory(projectId)
throw error
}
} finally {
// clean up the temporary directory
await fse.remove(tempDir)
}
}