const _ = require('lodash')
const settings = require('@overleaf/settings')
const OError = require('@overleaf/o-error')
const fs = require('fs')
const fse = require('fs-extra')
const { ObjectId } = require('mongodb')
const request = require('request')
const { pipeline } = require('stream')
const unzipper = require('unzipper')
const util = require('util')
const logger = require('@overleaf/logger')
const path = require('path')
const {
  FileTooLargeError,
  InvalidNameError,
} = require('../../../../app/src/Features/Errors/Errors')
const FilestoreHandler = require('../../../../app/src/Features/FileStore/FileStoreHandler')
const ProjectGetter = require('../../../../app/src/Features/Project/ProjectGetter')
const RedisWrapper = require('../../../../app/src/infrastructure/RedisWrapper')
const HistoryManager = require('../../../../app/src/Features/History/HistoryManager')
const ProjectHistoryHandler = require('../../../../app/src/Features/Project/ProjectHistoryHandler')
const ProjectUpdateHandler = require('../../../../app/src/Features/Project/ProjectUpdateHandler')
const DocumentUpdaterHandler = require('../../../../app/src/Features/DocumentUpdater/DocumentUpdaterHandler')
const ProjectEntityHandler = require('../../../../app/src/Features/Project/ProjectEntityHandler')
const ProjectEntityUpdateHandler = require('../../../../app/src/Features/Project/ProjectEntityUpdateHandler')
const SafePath = require('../../../../app/src/Features/Project/SafePath')
const { DeletedFile } = require('../../../../app/src/models/DeletedFile')
const { Doc } = require('../../../../app/src/models/Doc')
const {
  iterablePaths,
} = require('../../../../app/src/Features/Project/IterablePath')

const rclient = RedisWrapper.client('project_history_migration')

module.exports = { deleteProjectHistory, migrateProjectHistory }
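
// A minimal usage sketch (hypothetical call site; the module path and option values are
// illustrative, not part of this file). The exports are intended to be driven from a
// migration script that iterates over project ids:
//
//   const { migrateProjectHistory } = require('./ProjectHistoryMigration') // assumed filename
//   await migrateProjectHistory(projectId, {
//     archiveOnFailure: true, // store the track-changes zip in the project if the migration fails
//     fixInvalidCharacters: true, // replace non-BMP/null characters instead of throwing
//     // cutoffDate: new Date('2023-01-01'), // abort if the project was edited after this date
//   })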

/**
 * @typedef {Object} UpdateMeta
 * @property {string | null} user_id the id of the user that performed the update
 * @property {number} ts the timestamp of the update
 */

/**
 * @typedef {UpdateMeta} EditDocUpdateMeta
 * @property {string | null} user_id
 * @property {number} ts
 * @property {string} pathname the doc pathname
 * @property {number} doc_length the length of the doc
 */

/**
 * @typedef {Object} Update
 * @property {string} pathname the path in the file tree
 * @property {UpdateMeta} meta
 // * @property {string} version a two-part version. The first part is the project version after the updates, as recorded in Mongo. The second part is a counter that increments for each update in this batch.
 * @property {string} projectHistoryId the v1 history id for this project
 * @property {number} v
 */

/**
 * @typedef {Update} FileUpdate
 * @property {string} pathname
 * @property {UpdateMeta} meta
 * @property {string} projectHistoryId
 * @property {number} v
 * @property {string} file
 */

/**
 * @typedef {FileUpdate} AddFileUpdate
 * @property {string} pathname
 * @property {UpdateMeta} meta
 * @property {string} projectHistoryId
 * @property {number} v
 * @property {string} file
 * @property {string} url
 */

/**
 * @typedef {Update} DocUpdate
 * @property {UpdateMeta} meta
 * @property {string} projectHistoryId
 * @property {number} v
 * @property {string} doc
 */

/**
 * @typedef {DocUpdate} AddDocUpdate
 * @property {string} pathname
 * @property {UpdateMeta} meta
 * @property {string} projectHistoryId
 * @property {number} v
 * @property {string} doc
 * @property {string} docLines
 * @property {string} docLinesId
 * @property {boolean} contentStored
 */

/**
 * @typedef {DocUpdate} EditDocUpdate
 * @property {EditDocUpdateMeta} meta
 * @property {string} projectHistoryId
 * @property {number} v
 * @property {number} lastV
 * @property {string} doc
 * @property {Array<Object>} op
 */

/**
 * @typedef {AddDocUpdate | AddFileUpdate} AddUpdate
 */

/**
 * @typedef {DocUpdate | FileUpdate} DeleteUpdate
 * @property {string} pathname
 * @property {UpdateMeta} meta
 * @property {string} projectHistoryId
 * @property {number} v
 * @property {string} doc
 * @property {string} new_pathname
 */

/**
 * @typedef {Update} EditDocUpdateStub
 * @property {true} stub
 * @property {string} path
 * @property {string} pathname
 * @property {number} v
 * @property {number} doc_length
 */

/**
 * @typedef {AddUpdate | DeleteUpdate | EditDocUpdate | EditDocUpdateStub} AnyUpdate
 */

/**
 * @typedef {Object} Project
 * @property {string} _id the id of the project
 * @property {Object} overleaf
 */

/**
 * @typedef ManifestUpdate
 * @property {string} path
 * @property {number} doc_length
 * @property {number} ts
 * @property {number} version
 */

/**
 * @typedef ManifestContent
 * @property {{ path: string, version: number }} start
 */

/**
 * @typedef ManifestDoc
 * @property {string} id
 * @property {ManifestContent} content
 * @property {Array<ManifestUpdate>} updates
 */

/**
 * @typedef {Object} Manifest
 * @property {string} projectId
 * @property {Array<ManifestDoc>} docs
 */

/**
 * @typedef Entity
 * @property {string} type
 * @property {string} path
 * @property {string} docLines
 * @property {string} deletedAt
 * @property {boolean} deleted
 */
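
// Illustrative examples of the update shapes defined above (all values are invented).
// An AddDocUpdate, as built by buildUpdates/buildAddUpdate below:
//   {
//     doc: '5f0c…', pathname: '/main.tex', v: 0,
//     meta: { user_id: null, ts: 1594634400000, origin: { kind: 'history-migration' } },
//     projectHistoryId: '42',
//     docLines: '\\documentclass{article}\n…',
//   }
// An EditDocUpdate, as built by buildEditDocUpdate below:
//   {
//     doc: '5f0c…', op: [{ p: 10, i: 'hello' }], v: 4, lastV: 3,
//     meta: { user_id: '507f1f77bcf86cd799439011', ts: 1594634500000, pathname: '/main.tex',
//             doc_length: 120, origin: { kind: 'history-migration' } },
//     projectHistoryId: '42',
//   }
// Note: `v` is only used for sorting and is stripped from non-edit updates before queueing;
// non-edit timestamps are converted to ISO strings in sendUpdatesToProjectHistory.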

/**
 * Iterate recursively through the folders in project.rootFolder,
 * building a map of all the docs (with content as a docLines string)
 * and files (with content as a filestore URL).
 *
 * @param {Object} project
 * @returns {Promise<Map<string, Entity>>}
 */
async function processRootFolder(project) {
  const entities = new Map()

  async function processFolder(folder, root = '') {
    for (const item of iterablePaths(folder, 'docs')) {
      const doc = await Doc.findOne(
        item._id,
        // only read the fields we need to save memory
        { _id: 1, inS3: 1, lines: 1, name: 1 }
      ).lean()
      // skip malformed doc entries
      if (!doc?._id) {
        logger.warn({ doc }, 'skipping doc with missing id')
        continue
      }
      const id = doc._id.toString()
      const docIsInS3 = !!doc.inS3
      let docLines
      if (docIsInS3) {
        const docPeek = await ProjectEntityHandler.promises.getDoc(
          project._id,
          item._id,
          { peek: true }
        )
        docLines = docPeek.lines
      } else {
        docLines = doc.lines
      }
      if (!docLines) {
        throw new Error(`no doc lines for doc ${id} (inS3: ${docIsInS3})`)
      }
      entities.set(id, {
        path: `${root}/${item.name}`, // NOTE: not doc.name, which is "new doc"
        type: 'doc',
        docLines: docLines.join('\n'),
      })
    }

    for (const item of iterablePaths(folder, 'fileRefs')) {
      const path = `${root}/${item.name}`
      // skip malformed file entries
      if (!item?._id) {
        logger.warn({ item }, 'skipping fileRef with missing id')
        continue
      }
      const id = item._id.toString()
      entities.set(id, {
        path,
        type: 'file',
        url: FilestoreHandler._buildUrl(project._id.toString(), id),
      })
    }

    for (const subfolder of iterablePaths(folder, 'folders')) {
      const path = `${root}/${subfolder.name}`
      await processFolder(subfolder, path)
    }
  }

  for (const folder of project.rootFolder) {
    await processFolder(folder)
  }

  return entities
}
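
// For illustration only (ids, paths and content invented), the resulting entities map
// might look like:
//   Map {
//     '5f0c94…' => { path: '/main.tex', type: 'doc', docLines: '\\documentclass{article}\n…' },
//     '5f0c95…' => { path: '/images/logo.png', type: 'file', url: '<filestore url for this file>' },
//   }
// Deleted docs and files are added later by readDeletedDocs/readDeletedFiles, with
// `deleted: true` and a `/_deleted/<id>/<name>` path.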

/**
 * Read docs deleted from a project, from the Doc collection,
 * and add them to the entities map with the content in a docLines string.
 *
 * These entities have a `deleted` property set to `true` and a `deletedAt` date.
 *
 * @param {Map<string, Object>} entities
 * @param {string} projectId
 * @returns {Promise<void>}
 */
async function readDeletedDocs(entities, projectId) {
  // NOTE: could call DocstoreManager.promises.getAllDeletedDocs(projectId) instead
  // Look for all docs, since some deleted docs appear in the track-changes manifest
  // but do not have the deleted flag set, for reasons that are unclear
  // (we will not add docs to entities if they were previously added by processRootFolder)
  const deletedDocsCursor = Doc.find(
    {
      project_id: ObjectId(projectId),
    },
    // only read the fields we need to save memory
    { _id: 1, inS3: 1, lines: 1, name: 1, deletedAt: 1 }
  )
    .lean()
    .cursor()

  for await (const doc of deletedDocsCursor) {
    // skip malformed deleted doc entries
    if (!doc?._id) {
      logger.warn({ doc }, 'skipping deleted doc with missing id')
      continue
    }
    const id = doc._id.toString()
    // skip the doc if we already have an entry for it in entities
    if (!entities.has(id)) {
      const docIsInS3 = !!doc.inS3
      let docLines
      if (docIsInS3) {
        const docPeek = await ProjectEntityHandler.promises.getDoc(
          ObjectId(projectId),
          doc._id,
          { peek: true }
        )
        docLines = docPeek.lines
      } else {
        docLines = doc.lines
      }
      if (!docLines) {
        throw new Error(`no doc lines for doc ${id} (inS3: ${docIsInS3})`)
      }
      // const ts = Number(
      //   doc.deletedAt ? new Date(doc.deletedAt) : Date.now()
      // )
      if (doc.name && !SafePath.isCleanFilename(doc.name)) {
        const newName = SafePath.clean(doc.name)
        logger.warn(
          { projectId, docId: id, origName: doc.name, newName },
          'renaming invalid deleted doc'
        )
        doc.name = newName
      }
      entities.set(id, {
        // NOTE: adding the doc id to the file path to avoid collisions
        path: `/_deleted/${id}/${doc.name}`,
        name: doc.name || 'unnamed', // fallback for improperly deleted docs
        deleted: true,
        type: 'doc',
        deletedAt: doc.deletedAt,
        docLines: docLines.join('\n'),
      })
    }
  }
}

/**
 * Read files deleted from a project, from the DeletedFile collection,
 * and add them to the entities map.
 *
 * These entities have a `deleted` property set to `true` and a `deletedAt` date.
 * The url is built later, from the project id and file id.
 *
 * @param {Map<string, Object>} entities
 * @param {string} projectId
 * @returns {Promise<void>}
 */
async function readDeletedFiles(entities, projectId) {
  const deletedFilesCursor = DeletedFile.find(
    {
      projectId: ObjectId(projectId),
    },
    // only read the fields we need to save memory
    { _id: 1, name: 1, deletedAt: 1 }
  )
    .lean()
    .cursor()

  for await (const file of deletedFilesCursor) {
    // skip malformed deleted file entries
    if (!file?._id) {
      logger.warn({ file }, 'skipping deleted file with missing id')
      continue
    }
    const id = file._id.toString()
    // skip the file if we already have an entry for it in entities
    if (!entities.has(id)) {
      // const ts = Number(
      //   file.deletedAt ? new Date(file.deletedAt) : Date.now()
      // )
      // TODO: would the hash be useful here?
      if (file.name && !SafePath.isCleanFilename(file.name)) {
        const newName = SafePath.clean(file.name)
        logger.warn(
          { projectId, fileId: id, origName: file.name, newName },
          'renaming invalid deleted file'
        )
        file.name = newName
      }
      entities.set(id, {
        // NOTE: adding the file id to the file path to avoid collisions
        path: `/_deleted/${id}/${file.name}`,
        name: file.name,
        deleted: true,
        type: 'file',
        deletedAt: file.deletedAt,
      })
    }
  }
}

/**
 * Iterate through the sorted array of updates, pushing each one to Redis.
 *
 * In batches, tell project-history to pull the updates from Redis and process them,
 * so the process fails early if something can't be processed.
 *
 * @param {Array<AnyUpdate>} updates
 * @param {string} projectId
 * @param {string} projectHistoryId
 * @param {Map.<string, Object>} fileMap
 * @returns {Promise<number>} the length of the Redis queue after the final flush
 */
async function sendUpdatesToProjectHistory(
  updates,
  projectId,
  projectHistoryId,
  fileMap
) {
  let multi = rclient.multi()
  let counter = 0
  let processed = 0
  let size = 0

  const projectHistoryKey =
    settings.redis.project_history_migration.key_schema.projectHistoryOps({
      projectId,
    })

  // clear out anything in the Redis queue for this project's history
  multi.del(projectHistoryKey)

  for (let update of updates) {
    // read the content for each update stub from the archive
    if (update.stub) {
      update = await buildEditDocUpdate(projectHistoryId, update, fileMap)
    }
    // non-edit doc updates need string timestamps, not numbers
    if (!('op' in update)) {
      update.meta.ts = new Date(update.meta.ts).toISOString()
    }
    const updateJSON = JSON.stringify(update)
    multi.rpush(projectHistoryKey, updateJSON)
    counter++
    processed++
    size += updateJSON.length

    // flush the history after every 1000 updates and start a new transaction
    if (counter === 1000) {
      logger.debug(
        { processed, total: updates.length },
        'sending updates to project history'
      )
      // execute the transaction
      await util.promisify(multi.exec)()
      // tell project-history to pull the updates from the Redis queue
      await HistoryManager.promises.flushProject(projectId) // TODO: roll back if this fails?
      counter = 0
      size = 0
      multi = rclient.multi()
    } else if (size > 1024 * 1024) {
      // queue entries in Redis more frequently to reduce memory usage
      await util.promisify(multi.exec)()
      size = 0
      multi = rclient.multi()
    }
  }

  if (counter > 0) {
    // execute the transaction
    await util.promisify(multi.exec)()
    // tell project-history to pull the updates from the Redis queue
    await HistoryManager.promises.flushProject(projectId) // TODO: roll back if this fails?
  }

  // return the queue length so we can check that it is empty
  const queueLength = await rclient.llen(projectHistoryKey)
  return queueLength
}

/**
 * Compare two arrays of updates, sorting the queue whose last element has the
 * earliest timestamp first.
 *
 * @param {Array<AnyUpdate>} a
 * @param {Array<AnyUpdate>} b
 * @returns {number}
 */
function earliestTimestampFirst(a, b) {
  // both arrays are empty, leave them in place
  if (!a.length && !b.length) {
    return 0
  }
  // a is empty, move b before a
  if (!a.length) {
    return 1
  }
  // b is empty, don't move b before a
  if (!b.length) {
    return -1
  }
  const tsB = b[b.length - 1].meta.ts
  const tsA = a[a.length - 1].meta.ts
  // if the last item in b has a lower timestamp than the last item in a, move b above a
  if (tsB < tsA) {
    return 1
  }
  if (tsB > tsA) {
    return -1
  }
  // use pathnames as a secondary sort key, to make the order deterministic for
  // updates with the same timestamp
  const pathnameB = b[b.length - 1].pathname
  const pathnameA = a[a.length - 1].pathname
  if (pathnameB < pathnameA) {
    return 1
  }
  if (pathnameB > pathnameA) {
    return -1
  }
  return 0 // shouldn't happen, because pathnames must be distinct
}
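
// Illustrative behaviour (timestamps invented): each queue keeps its oldest remaining
// update at the tail, so given
//   a = [{ …, meta: { ts: 200 } }] and b = [{ …, meta: { ts: 100 } }]
// earliestTimestampFirst(a, b) returns 1, and Array.prototype.sort moves the queue
// whose next (tail) update is oldest to the front.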

/**
 * Compare two updates, sorting the highest version number first
 *
 * @param {AnyUpdate} a
 * @param {AnyUpdate} b
 * @returns {number}
 */
function decreasingDocVersion(a, b) {
  if (b.v === a.v) {
    throw new Error(`Matching version: ${b.v} ${a.v}`)
    // return 0
  }
  // if b.v is greater than a.v, sort b above a
  return b.v > a.v ? 1 : -1
}

/**
 * Create an array of queued updates for each doc/file, sorted by version
 *
 * @param {Array<AnyUpdate>} updates
 * @returns {Promise<Array<Array<AnyUpdate>>>}
 */
async function sortUpdatesByQueue(updates) {
  // build a queue of updates for each doc/file
  const queues = {}
  for (const update of updates) {
    const docId = update.doc || update.file
    if (!(docId in queues)) {
      queues[docId] = []
    }
    queues[docId].push(update)
  }
  // convert the map to an array of queues
  const values = Object.values(queues)
  for (const queue of values) {
    // sort each queue in place, with the updates in decreasing version order
    queue.sort(decreasingDocVersion)
  }
  return values
}
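
// Illustrative example (values invented): given
//   [{ doc: 'a', v: 2 }, { doc: 'b', v: 1 }, { doc: 'a', v: 1 }]
// sortUpdatesByQueue resolves to
//   [[{ doc: 'a', v: 2 }, { doc: 'a', v: 1 }], [{ doc: 'b', v: 1 }]]
// i.e. one queue per doc/file, each with its lowest-version (oldest) update at the tail,
// ready to be popped by the merge loop in migrateTrackChangesUpdates.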

/**
 * Fetch all the content and updates for this project from track-changes, as a zip archive.
 *
 * @param {string} projectId
 * @param {string} tempFilePath
 * @returns {Promise<void>}
 */
async function fetchTrackChangesArchive(projectId, tempFilePath) {
  const writeStream = fs.createWriteStream(tempFilePath)
  const url = `${settings.apis.trackchanges.url}/project/${projectId}/zip`
  // exposed for debugging during the full-project-history migration
  const timeout =
    parseInt(process.env.FETCH_TRACK_CHANGES_TIMEOUT, 10) || 2 * 60 * 1000
  try {
    await util.promisify(pipeline)(request(url, { timeout }), writeStream)
  } catch (err) {
    logger.error({ err }, 'Error fetching track changes archive')
    throw err
  }
  const { size } = await fs.promises.stat(tempFilePath)
  logger.info({ projectId, size }, 'fetched zip file from track-changes')
}

/**
 * Open the zip archive and build a Map of each entry in the archive, with the path as the key
 *
 * @param {string} filePath
 * @returns {Promise<Map<string, Object>>}
 */
async function openTrackChangesArchive(filePath) {
  const directory = await unzipper.Open.file(filePath)
  return new Map(directory.files.map(file => [file.path, file]))
}

/**
 * Read the manifest data from the zip archive
 *
 * @param {Map<string, Object>} fileMap
 * @returns {Promise<Manifest>}
 */
async function readTrackChangesManifest(fileMap) {
  const manifestBuffer = await fileMap.get('manifest.json').buffer()
  return JSON.parse(manifestBuffer.toString())
}
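
// Illustrative manifest shape (ids, paths and numbers invented; the real entry names are
// whatever track-changes writes into the zip):
//   {
//     projectId: '5f0c93…',
//     docs: [
//       {
//         id: '5f0c94…',
//         content: { start: { path: 'content/5f0c94…/start', version: 3 } },
//         updates: [
//           { path: 'updates/5f0c94…/4.json', doc_length: 120, ts: 1594634400000, version: 4 },
//         ],
//       },
//     ],
//   }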

/**
 * Check that entities conform to the pathnames allowed by project history
 *
 * @param {Map<string, Object>} entities
 * @param {string} projectId
 */
function validatePaths(entities, projectId) {
  const pathErrors = []
  for (const [id, entity] of entities) {
    if (!SafePath.isCleanPath(entity.path)) {
      pathErrors.push(
        `${entity.type}: ${id}${entity.deleted ? ' (deleted)' : ''} path: ${
          entity.path
        }`
      )
    }
  }
  if (pathErrors.length) {
    throw new OError('Invalid path in history migration', {
      projectId,
      pathErrors,
    })
  }
}

/**
 * Build an "add" update for an entity, with docLines or url set for the content.
 * This represents a doc or file being added to a project.
 *
 * @param {Object} entity
 * @param {string} entityId
 * @param {string} projectId
 * @param {string} projectHistoryId
 *
 * @returns {AddDocUpdate | AddFileUpdate}
 */
function buildAddUpdate(entity, entityId, projectId, projectHistoryId) {
  const ts = new ObjectId(entityId).getTimestamp()
  const update = {
    pathname: entity.path,
    v: 0, // NOTE: only for sorting
    meta: {
      // source?
      user_id: null, // TODO: assign the update to a system user?
      ts: Number(ts),
      origin: { kind: 'history-migration' },
    },
    projectHistoryId,
  }
  switch (entity.type) {
    case 'doc': {
      return {
        doc: entityId,
        ...update,
        docLines: entity.docLines,
      }
    }
    case 'file': {
      // TODO: set a hash here?
      return {
        // type: 'external',
        file: entityId,
        ...update,
        url: FilestoreHandler._buildUrl(projectId, entityId),
      }
    }
    default:
      throw new Error('Unknown entity type')
  }
}

/**
 * Build a "delete" update for an entity, with new_pathname set to an empty string.
 * This represents a doc or file being deleted from a project.
 *
 * @param {Object} entity
 * @param {string} entityId
 * @param {string} projectId
 * @param {string} projectHistoryId
 * @returns {DeleteUpdate}
 */
function buildDeleteUpdate(entity, entityId, projectId, projectHistoryId) {
  const ts = entity.deletedAt || new Date()
  const update = {
    pathname: entity.path,
    new_pathname: '', // an empty path means deletion
    v: Infinity, // NOTE: only for sorting
    meta: {
      user_id: null, // TODO: assign this to a system user?
      ts: Number(ts),
      origin: { kind: 'history-migration' },
    },
    projectHistoryId,
  }
  switch (entity.type) {
    case 'doc':
      return {
        doc: entityId,
        ...update,
      }
    case 'file':
      return {
        file: entityId,
        ...update,
      }
    default:
      throw new Error(`Unknown entity type ${entity.type}`)
  }
}

/**
 * @typedef TrackedDocUpdateMeta
 * @property {string} user_id
 * @property {number} start_ts
 */

/**
 * @typedef TrackedDocUpdate
 * @property {string} doc_id
 * @property {Array<Object>} op
 * @property {number} v
 * @property {TrackedDocUpdateMeta} meta
 */

/**
 * Build an "edit" update, with op set to an array of operations from track-changes.
 *
 * This represents the contents of a doc being edited in a project.
 *
 * @param {string} projectHistoryId
 * @param {EditDocUpdateStub} updateStub
 * @param {Map.<string, Object>} fileMap
 *
 * @returns {Promise<EditDocUpdate>}
 */
async function buildEditDocUpdate(projectHistoryId, updateStub, fileMap) {
  const buffer = await fileMap.get(updateStub.path).buffer()
  /**
   * @type TrackedDocUpdate
   */
  const data = JSON.parse(buffer.toString())
  let userId = data.meta.user_id
  if (userId === 'anonymous-user' || userId === 'null') {
    userId = null
  }
  if (userId != null && !/^[0-9a-f]{24}$/.test(userId)) {
    throw new OError('Bad user id in ShareLaTeX history edit update', {
      userId,
    })
  }
  return {
    doc: data.doc_id,
    op: data.op, // NOTE: this is an array of operations
    v: data.v,
    lastV: data.v - 1,
    meta: {
      user_id: userId,
      ts: data.meta.start_ts, // TODO: use data.meta.end_ts or update.ts?
      pathname: updateStub.pathname,
      doc_length: updateStub.doc_length,
      origin: { kind: 'history-migration' },
    },
    projectHistoryId,
  }
}

/**
 * Build a stub for an "edit" update, with all the metadata but not the actual operations.
 *
 * This represents a doc being edited in a project, with enough information for sorting,
 * but avoids loading the actual operations from the zip archive until they're needed,
 * so as not to run out of memory if the project's history is large.
 *
 * @param {ManifestUpdate} update
 * @param {Entity} entity
 * @param {string} docId
 * @returns {EditDocUpdateStub}
 */
function buildEditUpdateStub(update, entity, docId) {
  return {
    stub: true,
    doc: docId,
    v: update.version,
    path: update.path,
    pathname: entity.path,
    doc_length: update.doc_length,
    meta: {
      ts: update.ts,
      origin: { kind: 'history-migration' },
    },
  }
}
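
// Illustrative stub (values invented):
//   { stub: true, doc: '5f0c94…', v: 4, path: 'updates/5f0c94…/4.json', pathname: '/main.tex',
//     doc_length: 120, meta: { ts: 1594634400000, origin: { kind: 'history-migration' } } }
// `path` points at an entry inside the zip archive; buildEditDocUpdate reads it lazily,
// just before the update is pushed to Redis.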

/**
 * Build the sorted array of updates to be sent to project-history.
 *
 * 1. Process all the added and edited files from the track-changes archive.
 * 2. Process the other files from the project that have been added, and maybe deleted, without any edits.
 *
 * @param {string} projectId
 * @param {string} projectHistoryId
 * @param {Manifest} manifest
 * @param {Map.<string, Entity>} entities
 * @param {Map.<string, Object>} fileMap
 * @returns {Promise<Array<AnyUpdate>>}
 */
async function buildUpdates(
  projectId,
  projectHistoryId,
  manifest,
  entities,
  fileMap
) {
  /**
   * @type Array<AnyUpdate>
   */
  const updates = []

  // keep a list of doc ids which have updates in track-changes
  const updatedDocs = new Set()

  // process the existing docs with updates, from track-changes
  for (const doc of manifest.docs) {
    const entity = entities.get(doc.id)
    if (!entity) {
      throw new Error(`Entity not found for ${doc.id}`)
    }
    if (!entity.path) {
      throw new Error(`Path not found for ${doc.id}`)
    }

    // add the initial content
    const contentStart = doc.content.start
    const buffer = await fileMap.get(contentStart.path).buffer()

    /**
     * @type AddDocUpdate
     */
    const update = {
      doc: doc.id,
      pathname: entity.path,
      v: contentStart.version - 1,
      meta: {
        user_id: null, // TODO: assign this to a system user?
        ts: Number(ObjectId(doc.id).getTimestamp()),
        origin: { kind: 'history-migration' },
      },
      projectHistoryId,
      docLines: buffer.toString(),
    }
    updates.push(update)

    // push an edit update stub onto the array of updates for each recorded edit
    for (const update of doc.updates) {
      updates.push(buildEditUpdateStub(update, entity, doc.id))
    }

    updatedDocs.add(doc.id)
  }

  // process the docs which have been added/deleted without any updates being recorded
  for (const [id, entity] of entities.entries()) {
    if (entity.deleted) {
      // deleted entity: add the doc/file first, if track-changes has no updates for it
      if (!updatedDocs.has(id)) {
        updates.push(buildAddUpdate(entity, id, projectId, projectHistoryId))
      }
      // then delete the doc/file again (there may be updates between adding and deleting)
      updates.push(buildDeleteUpdate(entity, id, projectId, projectHistoryId))
    } else {
      if (!updatedDocs.has(id)) {
        // add a "not deleted" doc that isn't in the manifest either
        updates.push(buildAddUpdate(entity, id, projectId, projectHistoryId))
      }
    }
  }

  return updates
}

/**
 * Remove the `overleaf.history` object from the project and tell project-history to delete everything for this project.
 * (Note: project-history may not delete the actual history data yet, but it will at least delete the cached history id.)
 *
 * @param {string} projectId
 * @returns {Promise<void>}
 */
async function deleteProjectHistory(projectId) {
  await HistoryManager.promises.deleteProjectHistory(projectId)
  // TODO: send a message to document-updater?
  await ProjectHistoryHandler.unsetHistory(projectId)
}

/**
 * Send the updates from the track-changes zip file to project-history
 *
 * @param {string} projectId
 * @param {string} projectHistoryId
 * @param {Array<AnyUpdate>} updates
 * @param {Map.<string, Object>} fileMap
 */
async function migrateTrackChangesUpdates(
  projectId,
  projectHistoryId,
  updates,
  fileMap
) {
  // Build a queue for each doc, sorted by version (and by timestamp within each version)
  const queues = await sortUpdatesByQueue(updates)

  const sortedUpdates = []
  let item
  do {
    // find the earliest item from the tails of all the queues
    queues.sort(earliestTimestampFirst)
    item = queues[0].pop()
    if (item) {
      sortedUpdates.push(item)
    }
  } while (item)
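
  // This is effectively a k-way merge: each per-doc queue already has its oldest update
  // at the tail, so repeatedly re-sorting the queues and popping from queues[0] emits the
  // updates in (roughly) global timestamp order while preserving per-doc version order.
  // For example (invented timestamps), queues [[ts:300, ts:100], [ts:200]] are emitted
  // as ts:100, ts:200, ts:300.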

  // NOTE: leaving the version string code commented out, in case it ends up being needed
  // let majorVersion = 0
  // let minorVersion = 0
  for (const update of sortedUpdates) {
    // increment majorVersion if this is a file change
    if (!('op' in update)) {
      // remove v (only used for sorting)
      delete update.v
      // set version
      // majorVersion++
      // // minorVersion = 0
      // update.version = `${majorVersion}.${minorVersion}` // NOTE: not set as project-history doesn't need it and could cause problems if it gets higher than project.version
    }
    // increment minorVersion after every update
    // minorVersion++
  }

  // add each update to the Redis queue for project-history to process
  logger.debug(
    { projectId, projectHistoryId },
    'Sending updates for project to Redis'
  )
  const remainingQueueLength = await sendUpdatesToProjectHistory(
    sortedUpdates,
    projectId,
    projectHistoryId,
    fileMap
  )
  // failure will cause queued updates to be deleted (in the catch below)
  logger.debug(
    {
      projectId,
      projectHistoryId,
      remainingQueueLength,
    },
    'Updates sent to project-history'
  )
  if (remainingQueueLength > 0) {
    throw new Error('flush to project-history did not complete')
  }
  // TODO: roll back if any of the following fail?
  // TODO: check that the Redis queue is empty?
  // Clear any old entries in the main project history queue (these will not
  // have a history id)
  await HistoryManager.promises.flushProject(projectId)
}

/**
 * Add the zip file from track-changes to the project file tree.
 * We may be able to recover a failed history from the zip file in future.
 *
 * @param {string} projectId
 * @param {string} rootFolderId
 * @param {string} tempFilePath
 */
async function uploadTrackChangesArchiveToProject(
  projectId,
  rootFolderId,
  tempFilePath
) {
  const { size } = await fs.promises.stat(tempFilePath)
  if (size > settings.maxUploadSize) {
    throw new FileTooLargeError({
      message: 'track-changes archive exceeds maximum size for archiving',
      info: { size },
    })
  }
  const { fileRef } = await ProjectEntityUpdateHandler.promises.addFile(
    projectId,
    rootFolderId, // project.rootFolder[0]._id
    `OverleafHistory-${new Date().toISOString().substring(0, 10)}.zip`,
    tempFilePath,
    null,
    null, // no owner
    null // no source
  )
  logger.debug(
    { projectId, fileRef },
    'Uploaded track-changes zip archive to project due to error in migration'
  )
}

/**
 * Check all updates for invalid characters (non-BMP or null) and substitute
 * the unicode replacement character if options.fixInvalidCharacters is true,
 * otherwise throw an exception.
 *
 * @param {Array<AnyUpdate>} updates
 * @param {string} projectId
 * @param {Object} options
 */
function validateUpdates(updates, projectId, options) {
  const replace = options.fixInvalidCharacters

  // check for invalid characters
  function containsBadChars(str) {
    return /[\uD800-\uDBFF]/.test(str) || str.indexOf('\x00') !== -1
  }

  // replace invalid characters so that they will be accepted by history_v1
  function sanitise(str) {
    if (replace) {
      return str.replace(/[\uD800-\uDFFF]/g, '\uFFFD').replace('\x00', '\uFFFD')
    } else {
      throw new Error('invalid character in content')
    }
  }
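
  // For example (strings invented): containsBadChars('a\uD83D\uDE00b') is true, because the
  // emoji is encoded as a surrogate pair; with options.fixInvalidCharacters set,
  // sanitise('a\uD83D\uDE00b') returns 'a\uFFFD\uFFFDb' (each surrogate code unit is
  // replaced separately), otherwise sanitise() throws.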

  // Check the size of docLines in an update against the maximum size allowed by history_v1.
  // This catches docs which are too large when created, but not docs that
  // go over the limit due to edits.
  function checkSize(update) {
    if (update?.docLines?.length > settings.max_doc_length) {
      throw new FileTooLargeError({
        message: 'docLines exceeds maximum size for history',
        info: { docId: update.doc, size: update.docLines.length },
      })
    }
  }

  let latestTimestamp = 0

  // iterate over all the updates and their docLines or ops
  for (const update of updates) {
    checkSize(update)
    // Find the timestamp of the most recent edit (either adding a doc or editing a doc).
    // We exclude deletions, as these are created in the migration and we didn't record
    // the deletion time for older files.
    const isDeleteUpdate = update.new_pathname === ''
    if (
      update.doc &&
      !isDeleteUpdate &&
      update.meta.ts &&
      update.meta.ts > latestTimestamp
    ) {
      latestTimestamp = update.meta.ts
    }
    if (update.docLines && containsBadChars(update.docLines)) {
      logger.debug({ update, replace }, 'invalid character in docLines')
      update.docLines = sanitise(update.docLines)
    }
    if (update.op) {
      for (const op of update.op) {
        if (op.i && containsBadChars(op.i)) {
          logger.debug({ update, replace }, 'invalid character in insert op')
          op.i = sanitise(op.i)
        }
        if (op.d && containsBadChars(op.d)) {
          logger.debug({ update, replace }, 'invalid character in delete op')
          op.d = sanitise(op.d)
        }
      }
    }
  }

  logger.debug(
    { projectId, latestTimestamp, date: new Date(latestTimestamp) },
    'timestamp of most recent edit'
  )
  if (options.cutoffDate && new Date(latestTimestamp) > options.cutoffDate) {
    throw new Error('project was edited after cutoff date')
  }
}

/**
 * Migrate a project's history from track-changes to project-history
 *
 * @param {string} projectId
 * @param {Object} options
 *
 * @returns {Promise<void>}
 */
async function migrateProjectHistory(projectId, options = {}) {
  await fse.ensureDir(settings.path.projectHistories)
  const projectHistoriesDir = await fs.promises.realpath(
    settings.path.projectHistories
  )
  const tempDir = await fs.promises.mkdtemp(projectHistoriesDir + path.sep)
  const tempFilePath = path.join(tempDir, 'project.zip')

  try {
    // fetch the zip archive of rewound content and updates from track-changes,
    // store the zip archive to disk, then open it and build a Map of the entries
    if (options.importZipFilePath) {
      // use an existing track-changes archive on disk
      logger.debug(
        { src: options.importZipFilePath, dst: tempFilePath },
        'importing zip file'
      )
      await fs.promises.copyFile(options.importZipFilePath, tempFilePath)
      const { size } = await fs.promises.stat(tempFilePath)
      logger.info({ projectId, size }, 'imported zip file from disk')
    } else {
      await fetchTrackChangesArchive(projectId, tempFilePath)
    }
    const fileMap = await openTrackChangesArchive(tempFilePath)

    // read the manifest from the zip archive
    const manifest = await readTrackChangesManifest(fileMap)

    // check that the project id in the manifest matches,
    // to be sure we are using the correct zip file
    if (manifest.projectId !== projectId) {
      throw new Error(`Incorrect projectId: ${manifest.projectId}`)
    }

    // load the Project from MongoDB
    const project = await ProjectGetter.promises.getProject(projectId)

    // look for an existing history id on the project
    const oldProjectHistoryId = _.get(project, 'overleaf.history.id')

    // throw an error if there is already a history associated with the project
    if (oldProjectHistoryId) {
      throw new Error(
        `Project ${projectId} already has history ${oldProjectHistoryId}`
      )
    }

    try {
      // initialize a new project history and use the history id
      // NOTE: not setting the history id on the project yet
      const projectHistoryId = await HistoryManager.promises.initializeProject(
        projectId
      )

      try {
        // build a Map of the entities (docs and fileRefs) currently in the project,
        // with _id as the key
        const entities = await processRootFolder(project)

        // find all the deleted docs for this project and add them to the entity map
        await readDeletedDocs(entities, projectId)

        // find all the deleted files for this project and add them to the entity map
        await readDeletedFiles(entities, projectId)

        // check that the paths will not be rejected
        validatePaths(entities, projectId)

        // build the array of updates that make up the new history for this project
        const updates = await buildUpdates(
          projectId,
          projectHistoryId,
          manifest,
          entities,
          fileMap
        )

        // check that the updates don't contain any characters that will be rejected by history_v1
        validateUpdates(updates, projectId, options)

        if (updates.length) {
          await migrateTrackChangesUpdates(
            projectId,
            projectHistoryId,
            updates,
            fileMap
          )
        }
      } catch (error) {
        if (options?.archiveOnFailure) {
          // on error, optionally store the zip file in the project for future reference
          logger.debug(
            { projectId, error },
            'Error sending track-changes updates to project history, attempting to archive zip file in project'
          )
          try {
            await uploadTrackChangesArchiveToProject(
              projectId,
              project.rootFolder[0]._id,
              tempFilePath
            )
          } catch (error) {
            if (error instanceof InvalidNameError) {
              logger.info({ projectId }, 'zip file already archived in project')
            } else {
              throw error
            }
          } finally {
            // roll back the last updated timestamp and user
            logger.debug(
              { projectId },
              'rolling back last updated time after uploading zip file'
            )
            await ProjectUpdateHandler.promises.resetUpdated(
              projectId,
              project.lastUpdated,
              project.lastUpdatedBy
            )
          }
          // set the overleaf.history.zipFileArchivedInProject flag for future reference
          await ProjectHistoryHandler.promises.setMigrationArchiveFlag(
            projectId
          )
          // We consider archiving the zip file a "success" (at least we've given up on attempting
          // to migrate the history), so we don't rethrow the error and we carry on to initialise
          // the new empty history below.
        } else {
          // if we're not archiving the zip file then we rethrow the error to fail the migration
          throw error
        }
      }

      // set the project's history id once the updates have been successfully processed
      // (or we have given up and archived the zip file in the project)
      logger.debug(
        { projectId, projectHistoryId },
        'Setting history id on project'
      )
      await ProjectHistoryHandler.promises.setHistoryId(
        projectId,
        projectHistoryId
      )

      try {
        // tell document-updater to reload docs with the new history id
        logger.debug({ projectId }, 'Asking document-updater to clear project')
        await DocumentUpdaterHandler.promises.flushProjectToMongoAndDelete(
          projectId
        )

        // run a project-history resync in case any changes have arrived since the migration
        logger.debug(
          { projectId },
          'Asking project-history to force resync project'
        )
        await HistoryManager.promises.resyncProject(projectId, {
          force: true,
          origin: { kind: 'history-migration' },
        })
      } catch (error) {
        if (options.forceNewHistoryOnFailure) {
          logger.warn(
            { projectId },
            'failed to resync project, forcing new history'
          )
        } else {
          throw error
        }
      }

      logger.debug(
        { projectId },
        'Switching on full project history display for project'
      )
      // Set the display to v2 history but allow downgrading (second argument allowDowngrade = true)
      await ProjectHistoryHandler.promises.upgradeHistory(projectId, true)
    } catch (error) {
      // delete the history id again if something failed?
      logger.warn(
        OError.tag(
          error,
          'Something went wrong flushing and resyncing project; clearing full project history for project',
          { projectId }
        )
      )
      await deleteProjectHistory(projectId)
      throw error
    }
  } finally {
    // clean up the temporary directory
    await fse.remove(tempDir)
  }
}