const _ = require('lodash')
const settings = require('@overleaf/settings')
const OError = require('@overleaf/o-error')
const fs = require('fs')
const fse = require('fs-extra')
const { ObjectId } = require('mongodb')
const request = require('request')
const { pipeline } = require('stream')
const unzipper = require('unzipper')
const util = require('util')
const logger = require('@overleaf/logger')
const path = require('path')
const {
  FileTooLargeError,
  InvalidNameError,
} = require('../../../../app/src/Features/Errors/Errors')
const FilestoreHandler = require('../../../../app/src/Features/FileStore/FileStoreHandler')
const ProjectGetter = require('../../../../app/src/Features/Project/ProjectGetter')
const RedisWrapper = require('../../../../app/src/infrastructure/RedisWrapper')
const HistoryManager = require('../../../../app/src/Features/History/HistoryManager')
const ProjectHistoryHandler = require('../../../../app/src/Features/Project/ProjectHistoryHandler')
const ProjectUpdateHandler = require('../../../../app/src/Features/Project/ProjectUpdateHandler')
const DocumentUpdaterHandler = require('../../../../app/src/Features/DocumentUpdater/DocumentUpdaterHandler')
const ProjectEntityHandler = require('../../../../app/src/Features/Project/ProjectEntityHandler')
const ProjectEntityUpdateHandler = require('../../../../app/src/Features/Project/ProjectEntityUpdateHandler')
const SafePath = require('../../../../app/src/Features/Project/SafePath')
const { DeletedFile } = require('../../../../app/src/models/DeletedFile')
const { Doc } = require('../../../../app/src/models/Doc')
const {
  iterablePaths,
} = require('../../../../app/src/Features/Project/IterablePath')

const rclient = RedisWrapper.client('project_history_migration')

module.exports = { deleteProjectHistory, migrateProjectHistory }

/**
 * @typedef {Object} UpdateMeta
 * @property {string | null} user_id the id of the user that performed the update
 * @property {number} ts the timestamp of the update
 */

/**
 * @typedef {UpdateMeta} EditDocUpdateMeta
 * @property {string | null} user_id
 * @property {number} ts
 * @property {string} pathname the doc pathname
 * @property {number} doc_length the length of the doc
 */

/**
 * @typedef {Object} Update
 * @property {string} pathname the path in the file tree
 * @property {UpdateMeta} meta
 // * @property {string} version a two-part version. The first part is the project version after the updates, as recorded in Mongo. The second part is a counter that increments for each update in this batch.
 * @property {string} projectHistoryId the v1 history id for this project
 * @property {number} v
 */

/**
 * @typedef {Update} FileUpdate
 * @property {string} pathname
 * @property {UpdateMeta} meta
 * @property {string} projectHistoryId
 * @property {number} v
 * @property {string} file
 */

/**
 * @typedef {FileUpdate} AddFileUpdate
 * @property {string} pathname
 * @property {UpdateMeta} meta
 * @property {string} projectHistoryId
 * @property {number} v
 * @property {string} file
 * @property {string} url
 */

/**
 * @typedef {Update} DocUpdate
 * @property {UpdateMeta} meta
 * @property {string} projectHistoryId
 * @property {number} v
 * @property {string} doc
 */

/**
 * @typedef {DocUpdate} AddDocUpdate
 * @property {string} pathname
 * @property {UpdateMeta} meta
 * @property {string} projectHistoryId
 * @property {number} v
 * @property {string} doc
 * @property {string} docLines
 * @property {string} docLinesId
 * @property {boolean} contentStored
 */

/**
 * @typedef {DocUpdate} EditDocUpdate
 * @property {EditDocUpdateMeta} meta
 * @property {string} projectHistoryId
 * @property {number} v
 * @property {number} lastV
 * @property {string} doc
 * @property {Array<Object>} op
 */

/**
 * @typedef {AddDocUpdate | AddFileUpdate} AddUpdate
 */

/**
 * @typedef {DocUpdate | FileUpdate} DeleteUpdate
 * @property {string} pathname
 * @property {UpdateMeta} meta
 * @property {string} projectHistoryId
 * @property {number} v
 * @property {string} doc
 * @property {string} new_pathname
 */

/**
 * @typedef {Update} EditDocUpdateStub
 * @property {true} stub
 * @property {string} path
 * @property {string} pathname
 * @property {number} v
 * @property {number} doc_length
 */

/**
 * @typedef {AddUpdate | DeleteUpdate | EditDocUpdate | EditDocUpdateStub} AnyUpdate
 */

/**
 * @typedef {Object} Project
 * @property {string} _id the id of the project
 * @property {Object} overleaf
 */

/**
 * @typedef ManifestUpdate
 * @property {string} path
 * @property {number} doc_length
 * @property {number} ts
 * @property {number} version
 */

/**
 * @typedef ManifestContentStart
 * @property {string} path the archive entry containing the doc content at the start version
 * @property {number} version
 */

/**
 * @typedef ManifestContent
 * @property {ManifestContentStart} start
 */

/**
 * @typedef ManifestDoc
 * @property {string} id
 * @property {ManifestContent} content
 * @property {Array<ManifestUpdate>} updates
 */

/**
 * @typedef {Object} Manifest
 * @property {string} projectId
 * @property {Array<ManifestDoc>} docs
 */

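// For orientation, a minimal sketch of the manifest JSON described above (all ids,
// entry paths and numbers are invented for illustration):
//
//   {
//     "projectId": "5f0c1e2a3b4d5e6f7a8b9c0d",
//     "docs": [
//       {
//         "id": "5f0c1e2a3b4d5e6f7a8b9c0e",
//         "content": { "start": { "path": "<archive entry>", "version": 2 } },
//         "updates": [
//           { "path": "<archive entry>", "doc_length": 1024, "ts": 1588000000000, "version": 3 }
//         ]
//       }
//     ]
//   }
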
/**
 * @typedef Entity
 * @property {string} type
 * @property {string} path
 * @property {string} docLines
 * @property {string} deletedAt
 * @property {boolean} deleted
 */

/**
 * Iterate recursively through the folders in project.rootFolder,
 * building a map of all the docs (with content as a docLines string)
 * and files (with content as a filestore URL).
 *
 * @param {Object} project
 * @returns {Promise<Map<string, Entity>>}
 */
async function processRootFolder(project) {
  const entities = new Map()

  async function processFolder(folder, root = '') {
    for (const item of iterablePaths(folder, 'docs')) {
      const doc = await Doc.findOne(
        item._id,
        // only read the fields we need to save memory
        { _id: 1, inS3: 1, lines: 1, name: 1 }
      ).lean()

      // skip malformed doc entries
      if (!doc?._id) {
        logger.warn({ doc }, 'skipping doc with missing id')
        continue
      }
      const id = doc._id.toString()
      const docIsInS3 = !!doc.inS3
      let docLines

      if (docIsInS3) {
        const docPeek = await ProjectEntityHandler.promises.getDoc(
          project._id,
          item._id,
          { peek: true }
        )
        docLines = docPeek.lines
      } else {
        docLines = doc.lines
      }

      if (!docLines) {
        throw new Error(`no doc lines for doc ${id} (inS3: ${docIsInS3})`)
      }

      entities.set(id, {
        path: `${root}/${item.name}`, // NOTE: not doc.name, which is "new doc",
        type: 'doc',
        docLines: docLines.join('\n'),
      })
    }

    for (const item of iterablePaths(folder, 'fileRefs')) {
      const path = `${root}/${item.name}`

      // skip malformed file entries
      if (!item?._id) {
        logger.warn({ item }, 'skipping fileRef with missing id')
        continue
      }
      const id = item._id.toString()

      entities.set(id, {
        path,
        type: 'file',
        url: FilestoreHandler._buildUrl(project._id.toString(), id),
      })
    }

    for (const subfolder of iterablePaths(folder, 'folders')) {
      const path = `${root}/${subfolder.name}`
      await processFolder(subfolder, path)
    }
  }

  for (const folder of project.rootFolder) {
    await processFolder(folder)
  }

  return entities
}

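// Illustrative only: for a project with one doc and one image, the resulting map
// looks roughly like this (ids and paths invented; the url comes from
// FilestoreHandler._buildUrl):
//
//   new Map([
//     ['64a1...', { path: '/main.tex', type: 'doc', docLines: '\\documentclass{article}\n...' }],
//     ['64a2...', { path: '/figures/plot.png', type: 'file', url: '<filestore url>' }],
//   ])
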
/**
 * Read docs deleted from a project, from the Doc collection,
 * and add them to the entities map with the content in a docLines string.
 *
 * These entities have a `deleted` property set to `true` and a `deletedAt` date.
 *
 * @param {Map<string, Object>} entities
 * @param {string} projectId
 * @returns {Promise<void>}
 */
async function readDeletedDocs(entities, projectId) {
  // NOTE: could call DocstoreManager.promises.getAllDeletedDocs(projectId) instead

  // Look for all docs, since some deleted docs are found in the track-changes manifest
  // but do not have the deleted flag set, for reasons that are unclear
  // (we will not add docs to entities if they were previously added by processRootFolder)
  const deletedDocsCursor = Doc.find(
    {
      project_id: ObjectId(projectId),
    },
    // only read the fields we need to save memory
    { _id: 1, inS3: 1, lines: 1, name: 1, deletedAt: 1 }
  )
    .lean()
    .cursor()
  for await (const doc of deletedDocsCursor) {
    // skip malformed deleted doc entries
    if (!doc?._id) {
      logger.warn({ doc }, 'skipping deleted doc with missing id')
      continue
    }
    const id = doc._id.toString()
    // Skip doc if we already have an entry in entities
    if (!entities.has(id)) {
      const docIsInS3 = !!doc.inS3
      let docLines

      if (docIsInS3) {
        const docPeek = await ProjectEntityHandler.promises.getDoc(
          ObjectId(projectId),
          doc._id,
          { peek: true }
        )
        docLines = docPeek.lines
      } else {
        docLines = doc.lines
      }

      if (!docLines) {
        throw new Error(`no doc lines for doc ${id} (inS3: ${docIsInS3})`)
      }

      // const ts = Number(
      //   doc.deletedAt ? new Date(doc.deletedAt) : Date.now()
      // )

      if (doc.name && !SafePath.isCleanFilename(doc.name)) {
        const newName = SafePath.clean(doc.name)
        logger.warn(
          { projectId, docId: id, origName: doc.name, newName },
          'renaming invalid deleted doc'
        )
        doc.name = newName
      }

      entities.set(id, {
        // NOTE: adding the doc id to the file path to avoid collisions
        path: `/_deleted/${id}/${doc.name}`,
        name: doc.name || 'unnamed', // fallback for improperly deleted docs
        deleted: true,
        type: 'doc',
        deletedAt: doc.deletedAt,
        docLines: docLines.join('\n'),
      })
    }
  }
}

/**
 * Read files deleted from a project, from the DeletedFile collection,
 * and add them to the entities map.
 *
 * These entities have a `deleted` property set to `true` and a `deletedAt` date.
 * The url is built later, from the project id and file id.
 *
 * @param {Map<string, Object>} entities
 * @param {string} projectId
 * @returns {Promise<void>}
 */
async function readDeletedFiles(entities, projectId) {
  const deletedFilesCursor = DeletedFile.find(
    {
      projectId: ObjectId(projectId),
    },
    // only read the fields we need to save memory
    { _id: 1, name: 1, deletedAt: 1 }
  )
    .lean()
    .cursor()

  for await (const file of deletedFilesCursor) {
    // skip malformed deleted file entries
    if (!file?._id) {
      logger.warn({ file }, 'skipping deleted file with missing id')
      continue
    }
    const id = file._id.toString()
    // TODO: check if it already exists?
    if (!entities.has(id)) {
      // const ts = Number(
      //   file.deletedAt ? new Date(file.deletedAt) : Date.now()
      // )

      // TODO: would the hash be useful here?

      if (file.name && !SafePath.isCleanFilename(file.name)) {
        const newName = SafePath.clean(file.name)
        logger.warn(
          { projectId, fileId: id, origName: file.name, newName },
          'renaming invalid deleted file'
        )
        file.name = newName
      }

      entities.set(id, {
        // NOTE: adding the file id to the file path to avoid collisions
        path: `/_deleted/${id}/${file.name}`,
        name: file.name,
        deleted: true,
        type: 'file',
        deletedAt: file.deletedAt,
      })
    }
  }
}

/**
 * Iterate through the sorted array of updates, pushing each one to Redis.
 *
 * In batches, tell project-history to pull the updates from Redis and process them,
 * so the process fails early if something can't be processed.
 *
 * @param {Array<AnyUpdate>} updates
 * @param {string} projectId
 * @param {string} projectHistoryId
 * @param {Map.<string, Object>} fileMap
 * @returns {Promise<number>} the length of the Redis queue after the final flush
 */
async function sendUpdatesToProjectHistory(
  updates,
  projectId,
  projectHistoryId,
  fileMap
) {
  let multi = rclient.multi()
  let counter = 0
  let processed = 0
  let size = 0

  const projectHistoryKey =
    settings.redis.project_history_migration.key_schema.projectHistoryOps({
      projectId,
    })

  // clear out anything in the Redis queue for this project's history
  multi.del(projectHistoryKey)

  for (let update of updates) {
    // read the content for each update stub from the archive
    if (update.stub) {
      update = await buildEditDocUpdate(projectHistoryId, update, fileMap)
    }

    // non-edit doc updates need string timestamps, not numbers
    if (!('op' in update)) {
      update.meta.ts = new Date(update.meta.ts).toISOString()
    }

    const updateJSON = JSON.stringify(update)
    multi.rpush(projectHistoryKey, updateJSON)
    counter++
    processed++
    size += updateJSON.length

    // flush the history after every 1000 updates and start a new transaction
    if (counter === 1000) {
      logger.debug(
        { processed, total: updates.length },
        'sending updates to project history'
      )
      // execute the transaction
      await util.promisify(multi.exec)()
      // tell project-history to pull the updates from the Redis queue
      await HistoryManager.promises.flushProject(projectId) // TODO: roll back if this fails?
      counter = 0
      size = 0
      multi = rclient.multi()
    } else if (size > 1024 * 1024) {
      // queue entries in redis more frequently to reduce memory usage
      await util.promisify(multi.exec)()
      size = 0
      multi = rclient.multi()
    }
  }

  if (counter > 0) {
    // execute the transaction
    await util.promisify(multi.exec)()
    // tell project-history to pull the updates from the Redis queue
    await HistoryManager.promises.flushProject(projectId) // TODO: roll back if this fails?
  }

  // return the queue length so we can check that it is empty
  const queueLength = await rclient.llen(projectHistoryKey)
  return queueLength
}

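// Each queued entry is one JSON-serialised update, RPUSHed onto the key produced by
// key_schema.projectHistoryOps (the exact key format is defined in the
// project_history_migration Redis settings), roughly:
//
//   RPUSH <projectHistoryOps key> '{"doc":"64a1...","pathname":"/main.tex",...}'
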
/**
 * Compare two queues of updates by the timestamp of their last entries,
 * sorting the queue whose last entry has the earliest timestamp first.
 *
 * @param {Array<AnyUpdate>} a
 * @param {Array<AnyUpdate>} b
 * @returns {number}
 */
function earliestTimestampFirst(a, b) {
  // both arrays are empty, leave them
  if (!a.length && !b.length) {
    return 0
  }

  // a is empty, move b before a
  if (!a.length) {
    return 1
  }

  // b is empty, don't move b before a
  if (!b.length) {
    return -1
  }

  const tsB = b[b.length - 1].meta.ts
  const tsA = a[a.length - 1].meta.ts
  // if the last item in b has a lower timestamp than the last item in a, move b above a
  if (tsB < tsA) {
    return 1
  }
  if (tsB > tsA) {
    return -1
  }
  // use pathnames as secondary sort key, to make order deterministic for
  // updates with the same timestamp
  const pathnameB = b[b.length - 1].pathname
  const pathnameA = a[a.length - 1].pathname
  if (pathnameB < pathnameA) {
    return 1
  }
  if (pathnameB > pathnameA) {
    return -1
  }
  return 0 // shouldn't happen, because pathnames must be distinct
}

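// Worked example (invented data): each queue is sorted by decreasing version, so its
// last element is the earliest pending update; the queue whose tail has the lowest
// timestamp sorts first, ready for pop():
//
//   const q1 = [{ v: 2, meta: { ts: 300 }, pathname: '/a.tex' }, { v: 1, meta: { ts: 100 }, pathname: '/a.tex' }]
//   const q2 = [{ v: 1, meta: { ts: 200 }, pathname: '/b.tex' }]
//   ;[q1, q2].sort(earliestTimestampFirst) // => [q1, q2], because q1's tail (ts 100) is earliest
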
/**
 * Compare two updates, with the highest version number first
 *
 * @param {AnyUpdate} a
 * @param {AnyUpdate} b
 * @returns {number}
 */
function decreasingDocVersion(a, b) {
  if (b.v === a.v) {
    throw new Error(`Matching version: ${b.v} ${a.v}`)
    // return 0
  }
  // if b.v is greater than a.v, sort b above a
  return b.v > a.v ? 1 : -1
}

/**
 * Create an array of queued updates for each doc/file, sorted by version
 *
 * @param {Array<AnyUpdate>} updates
 * @returns {Promise<Array<Array<AnyUpdate>>>}
 */
async function sortUpdatesByQueue(updates) {
  // build a queue of updates for each doc/file
  const queues = {}

  for (const update of updates) {
    const docId = update.doc || update.file

    if (!(docId in queues)) {
      queues[docId] = []
    }

    queues[docId].push(update)
  }

  // convert the map to an array of queues
  const values = Object.values(queues)

  for (const queue of values) {
    // sort each queue in place, with each update in decreasing version order
    queue.sort(decreasingDocVersion)
  }

  return values
}

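// Worked example (invented ids): updates are grouped per doc/file, and each queue is
// sorted so that pop() yields the lowest version first:
//
//   await sortUpdatesByQueue([
//     { doc: 'a', v: 2 }, { doc: 'b', v: 1 }, { doc: 'a', v: 1 },
//   ])
//   // => [[{ doc: 'a', v: 2 }, { doc: 'a', v: 1 }], [{ doc: 'b', v: 1 }]]
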
/**
 * Fetch all the content and updates for this project from track-changes, as a zip archive.
 *
 * @param {string} projectId
 * @param {string} tempFilePath
 * @returns {Promise<void>}
 */
async function fetchTrackChangesArchive(projectId, tempFilePath) {
  const writeStream = fs.createWriteStream(tempFilePath)

  const url = `${settings.apis.trackchanges.url}/project/${projectId}/zip`

  // exposed for debugging during full-project-history migration
  const timeout =
    parseInt(process.env.FETCH_TRACK_CHANGES_TIMEOUT, 10) || 2 * 60 * 1000

  try {
    await util.promisify(pipeline)(request(url, { timeout }), writeStream)
  } catch (err) {
    logger.error({ err }, 'Error fetching track changes archive')
    throw err
  }

  const { size } = await fs.promises.stat(tempFilePath)
  logger.info({ projectId, size }, 'fetched zip file from track-changes')
}

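// For manual debugging, the same archive can be fetched outside this script; the
// host comes from settings.apis.trackchanges.url, so something along the lines of:
//
//   curl -o project.zip "http://<track-changes host>/project/<projectId>/zip"
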
/**
 * Open the zip archive and build a Map of each entry in the archive, with the path as the key
 *
 * @param {string} filePath
 * @returns {Promise<Map<string, Object>>}
 */
async function openTrackChangesArchive(filePath) {
  const directory = await unzipper.Open.file(filePath)
  return new Map(directory.files.map(file => [file.path, file]))
}

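// Typical usage (the only entry name this script relies on is 'manifest.json';
// every other entry is looked up via paths recorded in the manifest):
//
//   const fileMap = await openTrackChangesArchive('/tmp/project.zip')
//   const manifest = JSON.parse((await fileMap.get('manifest.json').buffer()).toString())
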
/**
 * Read the manifest data from the zip archive
 *
 * @param {Map<string, Object>} fileMap
 * @returns {Promise<Manifest>}
 */
async function readTrackChangesManifest(fileMap) {
  const manifestBuffer = await fileMap.get('manifest.json').buffer()

  return JSON.parse(manifestBuffer.toString())
}

/**
 * Check that entities conform to the pathnames allowed by project history
 *
 * @param {Map<string, Object>} entities
 * @param {string} projectId
 */
function validatePaths(entities, projectId) {
  const pathErrors = []
  for (const [id, entity] of entities) {
    if (!SafePath.isCleanPath(entity.path)) {
      pathErrors.push(
        `${entity.type}:${id}${entity.deleted ? ' (deleted)' : ''} path:${
          entity.path
        }`
      )
    }
  }
  if (pathErrors.length) {
    throw new OError('Invalid path in history migration', {
      projectId,
      pathErrors,
    })
  }
}

/**
 * Build an "add" update for an entity, with docLines or url set for the content.
 * This represents a doc or file being added to a project.
 *
 * @param {Object} entity
 * @param {string} entityId
 * @param {string} projectId
 * @param {string} projectHistoryId
 *
 * @returns {AddDocUpdate | AddFileUpdate}
 */
function buildAddUpdate(entity, entityId, projectId, projectHistoryId) {
  const ts = new ObjectId(entityId).getTimestamp()

  const update = {
    pathname: entity.path,
    v: 0, // NOTE: only for sorting
    meta: {
      // source?
      user_id: null, // TODO: assign the update to a system user?
      ts: Number(ts),
      origin: { kind: 'history-migration' },
    },
    projectHistoryId,
  }

  switch (entity.type) {
    case 'doc': {
      return {
        doc: entityId,
        ...update,
        docLines: entity.docLines,
      }
    }

    case 'file': {
      // TODO: set a hash here?
      return {
        // type: 'external',
        file: entityId,
        ...update,
        url: FilestoreHandler._buildUrl(projectId, entityId),
      }
    }

    default:
      throw new Error('Unknown entity type')
  }
}

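// Sketch of the output for a doc entity (id invented; ts is the creation time
// recovered from the ObjectId):
//
//   buildAddUpdate({ type: 'doc', path: '/main.tex', docLines: 'Hello' }, '64a1...', projectId, projectHistoryId)
//   // => {
//   //   doc: '64a1...',
//   //   pathname: '/main.tex',
//   //   v: 0,
//   //   meta: { user_id: null, ts: 1688..., origin: { kind: 'history-migration' } },
//   //   projectHistoryId,
//   //   docLines: 'Hello',
//   // }
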
/**
 * Build a "delete" update for an entity, with new_pathname set to an empty string.
 * This represents a doc or file being deleted from a project.
 *
 * @param {Object} entity
 * @param {string} entityId
 * @param {string} projectId
 * @param {string} projectHistoryId
 * @returns {DeleteUpdate}
 */
function buildDeleteUpdate(entity, entityId, projectId, projectHistoryId) {
  const ts = entity.deletedAt || new Date()

  const update = {
    pathname: entity.path,
    new_pathname: '', // empty path = deletion
    v: Infinity, // NOTE: only for sorting
    meta: {
      user_id: null, // TODO: assign this to a system user?
      ts: Number(ts),
      origin: { kind: 'history-migration' },
    },
    projectHistoryId,
  }

  switch (entity.type) {
    case 'doc':
      return {
        doc: entityId,
        ...update,
      }

    case 'file':
      return {
        file: entityId,
        ...update,
      }

    default:
      throw new Error(`Unknown entity type ${entity.type}`)
  }
}

/**
 * @typedef TrackedDocUpdateMeta
 * @property {string} user_id
 * @property {number} start_ts
 */

/**
 * @typedef TrackedDocUpdate
 * @property {string} doc_id
 * @property {Array<Object>} op
 * @property {number} v
 * @property {TrackedDocUpdateMeta} meta
 */

/**
 * Build an "edit" update, with op set to an array of operations from track-changes.
 *
 * This represents the contents of a doc being edited in a project.
 *
 * @param {string} projectHistoryId
 * @param {EditDocUpdateStub} updateStub
 * @param {Map.<string, Object>} fileMap
 *
 * @returns {Promise<EditDocUpdate>}
 */
async function buildEditDocUpdate(projectHistoryId, updateStub, fileMap) {
  const buffer = await fileMap.get(updateStub.path).buffer()

  /**
   * @type TrackedDocUpdate
   */
  const data = JSON.parse(buffer.toString())
  let userId = data.meta.user_id
  if (userId === 'anonymous-user' || userId === 'null') {
    userId = null
  }
  if (userId != null && !/^[0-9a-f]{24}$/.test(userId)) {
    throw new OError('Bad user id in ShareLaTeX history edit update', {
      userId,
    })
  }

  return {
    doc: data.doc_id,
    op: data.op, // NOTE: this is an array of operations
    v: data.v,
    lastV: data.v - 1,
    meta: {
      user_id: userId,
      ts: data.meta.start_ts, // TODO: use data.meta.end_ts or update.ts?
      pathname: updateStub.pathname,
      doc_length: updateStub.doc_length,
      origin: { kind: 'history-migration' },
    },
    projectHistoryId,
  }
}

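// Sketch of the transformation (ids and timestamps invented; the { i, p } insert /
// { d, p } delete op shapes are illustrative of ShareJS-style text ops):
//
//   archive entry: { "doc_id": "64a1...", "v": 7, "op": [{ "i": "x", "p": 0 }],
//                    "meta": { "user_id": "64b2...", "start_ts": 1588000000000 } }
//   result:        { doc: '64a1...', op: [{ i: 'x', p: 0 }], v: 7, lastV: 6,
//                    meta: { user_id: '64b2...', ts: 1588000000000, pathname: updateStub.pathname,
//                            doc_length: updateStub.doc_length, origin: { kind: 'history-migration' } },
//                    projectHistoryId }
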
/**
 * Build a stub for an "edit" update, with all the metadata but not the actual operations.
 *
 * This represents a doc being edited in a project, with enough information for sorting,
 * but avoids loading the actual operations from the zip archive until they're needed,
 * so as not to run out of memory if the project's history is large.
 *
 * @param {ManifestUpdate} update
 * @param {Entity} entity
 * @param {string} docId
 * @returns {EditDocUpdateStub}
 */
function buildEditUpdateStub(update, entity, docId) {
  return {
    stub: true,
    doc: docId,
    v: update.version,
    path: update.path,
    pathname: entity.path,
    doc_length: update.doc_length,
    meta: {
      ts: update.ts,
      origin: { kind: 'history-migration' },
    },
  }
}

/**
 * Build the sorted array of updates to be sent to project-history.
 *
 * 1. Process all the added and edited files from the track-changes archive.
 * 2. Process the other files from the project that have been added, and maybe deleted, without any edits.
 *
 * @param {string} projectId
 * @param {string} projectHistoryId
 * @param {Manifest} manifest
 * @param {Map.<string, Entity>} entities
 * @param {Map.<string, Object>} fileMap
 * @returns {Promise<Array<AnyUpdate>>}
 */
async function buildUpdates(
  projectId,
  projectHistoryId,
  manifest,
  entities,
  fileMap
) {
  /**
   * @type Array<AnyUpdate>
   */
  const updates = []

  // keep a list of doc ids which have updates in track-changes
  const updatedDocs = new Set()

  // process the existing docs with updates, from track-changes
  for (const doc of manifest.docs) {
    const entity = entities.get(doc.id)

    if (!entity) {
      throw new Error(`Entity not found for ${doc.id}`)
    }

    if (!entity.path) {
      throw new Error(`Path not found for ${doc.id}`)
    }

    // add the initial content
    const contentStart = doc.content.start

    const buffer = await fileMap.get(contentStart.path).buffer()

    /**
     * @type AddDocUpdate
     */
    const update = {
      doc: doc.id,
      pathname: entity.path,
      v: contentStart.version - 1,
      meta: {
        user_id: null, // TODO: assign this to a system user?
        ts: Number(ObjectId(doc.id).getTimestamp()),
        origin: { kind: 'history-migration' },
      },
      projectHistoryId,
      docLines: buffer.toString(),
    }

    updates.push(update)

    // push an edit update stub onto the array for each update in the manifest
    for (const update of doc.updates) {
      updates.push(buildEditUpdateStub(update, entity, doc.id))
    }

    updatedDocs.add(doc.id)
  }

  // process the docs which have been added/deleted without any updates being recorded
  for (const [id, entity] of entities.entries()) {
    if (entity.deleted) {
      // deleted entity

      // add the doc/file
      if (!updatedDocs.has(id)) {
        updates.push(buildAddUpdate(entity, id, projectId, projectHistoryId))
      }

      // delete the doc/file again (there may be updates added between adding and deleting)
      updates.push(buildDeleteUpdate(entity, id, projectId, projectHistoryId))
    } else {
      if (!updatedDocs.has(id)) {
        // add "not deleted" doc that isn't in the manifest either
        updates.push(buildAddUpdate(entity, id, projectId, projectHistoryId))
      }
    }
  }

  return updates
}

/**
 * Remove the `overleaf.history` object from the project and tell project-history to delete everything for this project.
 * (note: project-history may not delete the actual history data yet, but it will at least delete the cached history id)
 *
 * @param {string} projectId
 * @returns {Promise<void>}
 */
async function deleteProjectHistory(projectId) {
  await HistoryManager.promises.deleteProjectHistory(projectId)
  // TODO: send a message to document-updater?
  await ProjectHistoryHandler.unsetHistory(projectId)
}

/**
 * Send the updates from the track changes zip file to project history
 *
 * @param {string} projectId
 * @param {string} projectHistoryId
 * @param {Array<AnyUpdate>} updates
 * @param {Map.<string, Object>} fileMap
 */
async function migrateTrackChangesUpdates(
  projectId,
  projectHistoryId,
  updates,
  fileMap
) {
  // Build a queue for each doc, sorted by version (and by timestamp within each version)
  const queues = await sortUpdatesByQueue(updates)

  const sortedUpdates = []

  let item
  do {
    // Find the earliest item from the tail of all queues
    queues.sort(earliestTimestampFirst)
    item = queues[0].pop()
    if (item) {
      sortedUpdates.push(item)
    }
  } while (item)

  // NOTE: leaving the version string code commented out, in case it ends up being needed
  // let majorVersion = 0
  // let minorVersion = 0
  for (const update of sortedUpdates) {
    // increment majorVersion if this is a file change
    if (!('op' in update)) {
      // remove v (only used for sorting)
      delete update.v

      // set version
      // majorVersion++
      // // minorVersion = 0
      // update.version = `${majorVersion}.${minorVersion}` // NOTE: not set as project-history doesn't need it and could cause problems if it gets higher than project.version
    }
    // increment minorVersion after every update
    // minorVersion++
  }

  // add each update to the Redis queue for project-history to process
  logger.debug(
    { projectId, projectHistoryId },
    'Sending updates for project to Redis'
  )

  const remainingQueueLength = await sendUpdatesToProjectHistory(
    sortedUpdates,
    projectId,
    projectHistoryId,
    fileMap
  )
  // Failure will cause queued updates to be deleted (in the catch below)

  logger.debug(
    {
      projectId,
      projectHistoryId,
      remainingQueueLength,
    },
    'Updates sent to project-history'
  )

  if (remainingQueueLength > 0) {
    throw new Error('flush to project-history did not complete')
  }

  // TODO: roll back if any of the following fail?

  // TODO: check that the Redis queue is empty?

  // Clear any old entries in the main project history queue (these will not
  // have a history id)
  await HistoryManager.promises.flushProject(projectId)
}

/**
 * Add the zip file from track changes to the project file tree.
 * We may be able to recover a failed history from the zip file in future.
 *
 * @param {string} projectId
 * @param {string} rootFolderId
 * @param {string} tempFilePath
 */
async function uploadTrackChangesArchiveToProject(
  projectId,
  rootFolderId,
  tempFilePath
) {
  const { size } = await fs.promises.stat(tempFilePath)
  if (size > settings.maxUploadSize) {
    throw new FileTooLargeError({
      message: 'track-changes archive exceeds maximum size for archiving',
      info: { size },
    })
  }
  const { fileRef } = await ProjectEntityUpdateHandler.promises.addFile(
    projectId,
    rootFolderId, // project.rootFolder[0]._id,
    `OverleafHistory-${new Date().toISOString().substring(0, 10)}.zip`,
    tempFilePath,
    null,
    null, // no owner
    null // no source
  )
  logger.debug(
    { projectId, fileRef },
    'Uploaded track-changes zip archive to project due to error in migration'
  )
}

/**
 * Check all updates for invalid characters (non-BMP or null) and substitute
 * the unicode replacement character if options.fixInvalidCharacters is true,
 * otherwise throw an exception.
 * @param {Array<AnyUpdate>} updates
 * @param {string} projectId
 * @param {Object} options
 */
function validateUpdates(updates, projectId, options) {
  const replace = options.fixInvalidCharacters
  // check for invalid characters (any surrogate code unit or a null byte)
  function containsBadChars(str) {
    return /[\uD800-\uDFFF]/.test(str) || str.indexOf('\x00') !== -1
  }
  // Replace invalid characters so that they will be accepted by history_v1.
  function sanitise(str) {
    if (replace) {
      return str.replace(/[\uD800-\uDFFF]/g, '\uFFFD').replace(/\x00/g, '\uFFFD')
    } else {
      throw new Error('invalid character in content')
    }
  }
  // Check size of doclines in update against max size allowed by history_v1.
  // This catches docs which are too large when created, but not when they
  // go over the limit due to edits.
  function checkSize(update) {
    if (update?.docLines?.length > settings.max_doc_length) {
      throw new FileTooLargeError({
        message: 'docLines exceeds maximum size for history',
        info: { docId: update.doc, size: update.docLines.length },
      })
    }
  }
  let latestTimestamp = 0
  // Iterate over all the updates and their doclines or ops
  for (const update of updates) {
    checkSize(update)
    // Find the timestamp of the most recent edit (either adding a doc or editing a doc).
    // We exclude deletions, as these are created in the migration and we didn't record
    // the deletion time for older files.
    const isDeleteUpdate = update.new_pathname === ''
    if (
      update.doc &&
      !isDeleteUpdate &&
      update.meta.ts &&
      update.meta.ts > latestTimestamp
    ) {
      latestTimestamp = update.meta.ts
    }
    if (update.docLines && containsBadChars(update.docLines)) {
      logger.debug({ update, replace }, 'invalid character in docLines')
      update.docLines = sanitise(update.docLines)
    }
    if (update.op) {
      for (const op of update.op) {
        if (op.i && containsBadChars(op.i)) {
          logger.debug({ update, replace }, 'invalid character in insert op')
          op.i = sanitise(op.i)
        }
        if (op.d && containsBadChars(op.d)) {
          logger.debug({ update, replace }, 'invalid character in delete op')
          op.d = sanitise(op.d)
        }
      }
    }
  }
  logger.debug(
    { projectId, latestTimestamp, date: new Date(latestTimestamp) },
    'timestamp of most recent edit'
  )
  if (options.cutoffDate && new Date(latestTimestamp) > options.cutoffDate) {
    throw new Error('project was edited after cutoff date')
  }
}

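// Worked example of the sanitising pass when options.fixInvalidCharacters is true:
// a surrogate pair (one non-BMP character) becomes two replacement characters, and
// null bytes are replaced too:
//
//   sanitise('ab\uD83D\uDE00c\x00d') // => 'ab\uFFFD\uFFFDc\uFFFDd'
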
/**
 * Migrate a project's history from track-changes to project-history
 *
 * @param {string} projectId
 * @param {Object} [options]
 *
 * @returns {Promise<void>}
 */
async function migrateProjectHistory(projectId, options = {}) {
  await fse.ensureDir(settings.path.projectHistories)
  const projectHistoriesDir = await fs.promises.realpath(
    settings.path.projectHistories
  )
  const tempDir = await fs.promises.mkdtemp(projectHistoriesDir + path.sep)
  const tempFilePath = path.join(tempDir, 'project.zip')

  try {
    // fetch the zip archive of rewound content and updates from track-changes
    // store the zip archive to disk, open it and build a Map of the entries
    if (options.importZipFilePath) {
      // use an existing track-changes archive on disk
      logger.debug(
        { src: options.importZipFilePath, dst: tempFilePath },
        'importing zip file'
      )
      await fs.promises.copyFile(options.importZipFilePath, tempFilePath)
      const { size } = await fs.promises.stat(tempFilePath)
      logger.info({ projectId, size }, 'imported zip file from disk')
    } else {
      await fetchTrackChangesArchive(projectId, tempFilePath)
    }
    const fileMap = await openTrackChangesArchive(tempFilePath)

    // read the manifest from the zip archive
    const manifest = await readTrackChangesManifest(fileMap)

    // check that the project id in the manifest matches
    // to be sure we are using the correct zip file
    if (manifest.projectId !== projectId) {
      throw new Error(`Incorrect projectId: ${manifest.projectId}`)
    }

    // load the Project from MongoDB
    const project = await ProjectGetter.promises.getProject(projectId)

    // look for an existing history id on this project
    const oldProjectHistoryId = _.get(project, 'overleaf.history.id')

    // throw an error if there is already a history associated with the project
    if (oldProjectHistoryId) {
      throw new Error(
        `Project ${projectId} already has history ${oldProjectHistoryId}`
      )
    }

    try {
      // initialize a new project history and use the history id
      // NOTE: not setting the history id on the project yet
      const projectHistoryId = await HistoryManager.promises.initializeProject(
        projectId
      )

      try {
        // build a Map of the entities (docs and fileRefs) currently in the project,
        // with _id as the key
        const entities = await processRootFolder(project)

        // find all the deleted docs for this project and add them to the entity map
        await readDeletedDocs(entities, projectId)

        // find all the deleted files for this project and add them to the entity map
        await readDeletedFiles(entities, projectId)

        // check that the paths will not be rejected
        validatePaths(entities, projectId)

        // build the array of updates that make up the new history for this project
        const updates = await buildUpdates(
          projectId,
          projectHistoryId,
          manifest,
          entities,
          fileMap
        )

        // check that the updates don't contain any characters that will be rejected by history_v1.
        validateUpdates(updates, projectId, options)

        if (updates.length) {
          await migrateTrackChangesUpdates(
            projectId,
            projectHistoryId,
            updates,
            fileMap
          )
        }
      } catch (error) {
        if (options?.archiveOnFailure) {
          // on error, optionally store the zip file in the project for future reference
          logger.debug(
            { projectId, error },
            'Error sending track-changes updates to project history, attempting to archive zip file in project'
          )
          try {
            await uploadTrackChangesArchiveToProject(
              projectId,
              project.rootFolder[0]._id,
              tempFilePath
            )
          } catch (error) {
            if (error instanceof InvalidNameError) {
              logger.info({ projectId }, 'zip file already archived in project')
            } else {
              throw error
            }
          } finally {
            // roll back the last updated timestamp and user
            logger.debug(
              { projectId },
              'rolling back last updated time after uploading zip file'
            )
            await ProjectUpdateHandler.promises.resetUpdated(
              projectId,
              project.lastUpdated,
              project.lastUpdatedBy
            )
          }
          // set the overleaf.history.zipFileArchivedInProject flag for future reference
          await ProjectHistoryHandler.promises.setMigrationArchiveFlag(
            projectId
          )
          // we consider archiving the zip file as "success" (at least we've given up on attempting
          // to migrate the history) so we don't rethrow the error and continue to initialise the new
          // empty history below.
        } else {
          // if we're not archiving the zip file then we rethrow the error to fail the migration
          throw error
        }
      }

      // set the project's history id once the updates have been successfully processed
      // (or we have given up and archived the zip file in the project).
      logger.debug(
        { projectId, projectHistoryId },
        'Setting history id on project'
      )
      await ProjectHistoryHandler.promises.setHistoryId(
        projectId,
        projectHistoryId
      )

      try {
        // tell document updater to reload docs with the new history id
        logger.debug({ projectId }, 'Asking document-updater to clear project')
        await DocumentUpdaterHandler.promises.flushProjectToMongoAndDelete(
          projectId
        )

        // run a project history resync in case any changes have arrived since the migration
        logger.debug(
          { projectId },
          'Asking project-history to force resync project'
        )

        await HistoryManager.promises.resyncProject(projectId, {
          force: true,
          origin: { kind: 'history-migration' },
        })
      } catch (error) {
        if (options.forceNewHistoryOnFailure) {
          logger.warn(
            { projectId },
            'failed to resync project, forcing new history'
          )
        } else {
          throw error
        }
      }
      // set the display to v2 history
      logger.debug(
        { projectId },
        'Switching on full project history display for project'
      )
      await ProjectHistoryHandler.promises.upgradeHistory(projectId, true)
    } catch (error) {
      // delete the history id again if something failed?
      logger.warn(
        OError.tag(
          error,
          'Something went wrong flushing and resyncing project; clearing full project history for project',
          { projectId }
        )
      )
      await deleteProjectHistory(projectId)

      throw error
    }
  } finally {
    // clean up the temporary directory
    await fse.remove(tempDir)
  }
}