MongoManager = require "./MongoManager"
PackManager = require "./PackManager"
RedisManager = require "./RedisManager"
UpdateCompressor = require "./UpdateCompressor"
LockManager = require "./LockManager"
WebApiManager = require "./WebApiManager"
UpdateTrimmer = require "./UpdateTrimmer"
logger = require "logger-sharelatex"
async = require "async"
_ = require "underscore"
Settings = require "settings-sharelatex"
keys = Settings.redis.lock.key_schema

module.exports = UpdatesManager =
	compressAndSaveRawUpdates: (project_id, doc_id, rawUpdates, temporary, callback = (error) ->) ->
		length = rawUpdates.length
		if length == 0
			return callback()

		# check that ops are in the correct order
		for op, i in rawUpdates when i > 0
			thisVersion = op?.v
			prevVersion = rawUpdates[i-1]?.v
			if not (prevVersion < thisVersion)
				logger.error project_id: project_id, doc_id: doc_id, rawUpdates: rawUpdates, temporary: temporary, thisVersion: thisVersion, prevVersion: prevVersion, "op versions out of order"

		# FIXME: we no longer need the lastCompressedUpdate, so change functions not to need it
		# CORRECTION: we do use it to log the time in case of error
		MongoManager.peekLastCompressedUpdate doc_id, (error, lastCompressedUpdate, lastVersion) ->
			# lastCompressedUpdate is the most recent update in Mongo, and
			# lastVersion is its sharejs version number.
			#
			# The peekLastCompressedUpdate method may pass the update back
			# as 'null' (for example if the previous compressed update has
			# been archived). In this case it can still pass back the
			# lastVersion from the update to allow us to check consistency.
			return callback(error) if error?

			# Ensure that raw updates start where lastVersion left off
			if lastVersion?
				discardedUpdates = []
				rawUpdates = rawUpdates.slice(0)
				while rawUpdates[0]? and rawUpdates[0].v <= lastVersion
					discardedUpdates.push rawUpdates.shift()
				if discardedUpdates.length
					logger.error project_id: project_id, doc_id: doc_id, discardedUpdates: discardedUpdates, temporary: temporary, lastVersion: lastVersion, "discarded updates already present"

				if rawUpdates[0]? and rawUpdates[0].v != lastVersion + 1
					ts = lastCompressedUpdate?.meta?.end_ts
					last_timestamp = if ts? then new Date(ts) else 'unknown time'
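					# after discarding stale ops the first remaining op should be at
					# lastVersion + 1, so reaching this point means ops are missing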
					error = new Error("Tried to apply raw op at version #{rawUpdates[0].v} to last compressed update with version #{lastVersion} from #{last_timestamp}")
					logger.error err: error, doc_id: doc_id, project_id: project_id, prev_end_ts: ts, temporary: temporary, lastCompressedUpdate: lastCompressedUpdate, "inconsistent doc versions"
					if Settings.trackchanges?.continueOnError and rawUpdates[0].v > lastVersion + 1
						# we have lost some ops - continue to write into the database, we can't recover at this point
						lastCompressedUpdate = null
					else
						return callback error

			if rawUpdates.length == 0
				return callback()

			# some old large ops in redis need to be rejected, they predate
			# the size limit that now prevents them going through the system
			REJECT_LARGE_OP_SIZE = 4 * 1024 * 1024
			for rawUpdate in rawUpdates
				opSizes = ((op.i?.length || op.d?.length) for op in rawUpdate?.op or [])
				size = _.max opSizes
				if size > REJECT_LARGE_OP_SIZE
					error = new Error("dropped op exceeding maximum allowed size of #{REJECT_LARGE_OP_SIZE}")
					logger.error err: error, doc_id: doc_id, project_id: project_id, size: size, rawUpdate: rawUpdate, "dropped op - too big"
					rawUpdate.op = []

			compressedUpdates = UpdateCompressor.compressRawUpdates null, rawUpdates
			PackManager.insertCompressedUpdates project_id, doc_id, lastCompressedUpdate, compressedUpdates, temporary, (error, result) ->
				return callback(error) if error?
				logger.log {project_id, doc_id, orig_v: lastCompressedUpdate?.v, new_v: result.v}, "inserted updates into pack" if result?
				callback()

	# Check whether the updates are temporary (per-project property)
	_prepareProjectForUpdates: (project_id, callback = (error, temporary) ->) ->
		UpdateTrimmer.shouldTrimUpdates project_id, (error, temporary) ->
			return callback(error) if error?
			callback(null, temporary)

	# Check for project id on document history (per-document property)
	_prepareDocForUpdates: (project_id, doc_id, callback = (error) ->) ->
		MongoManager.backportProjectId project_id, doc_id, (error) ->
			return callback(error) if error?
			callback(null)

	# Apply updates for specific project/doc after preparing at project and doc level
	REDIS_READ_BATCH_SIZE: 100
	processUncompressedUpdates: (project_id, doc_id, temporary, callback = (error) ->) ->
		# get the updates as strings from redis (so we can delete them after they are applied)
		RedisManager.getOldestDocUpdates doc_id, UpdatesManager.REDIS_READ_BATCH_SIZE, (error, docUpdates) ->
			return callback(error) if error?
			length = docUpdates.length
			# parse the redis strings into ShareJs updates
			RedisManager.expandDocUpdates docUpdates, (error, rawUpdates) ->
				if error?
					logger.err project_id: project_id, doc_id: doc_id, docUpdates: docUpdates, "failed to parse docUpdates"
					return callback(error)
				logger.log project_id: project_id, doc_id: doc_id, rawUpdates: rawUpdates, "retrieved raw updates from redis"
				UpdatesManager.compressAndSaveRawUpdates project_id, doc_id, rawUpdates, temporary, (error) ->
					return callback(error) if error?
					logger.log project_id: project_id, doc_id: doc_id, "compressed and saved doc updates"
					# delete the applied updates from redis
					RedisManager.deleteAppliedDocUpdates project_id, doc_id, docUpdates, (error) ->
						return callback(error) if error?
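						# a full batch suggests more updates may still be queued in redis,
						# so keep draining further batches (via setTimeout, to yield the
						# event loop) until a short batch comes back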
						if length == UpdatesManager.REDIS_READ_BATCH_SIZE
							# There might be more updates
							logger.log project_id: project_id, doc_id: doc_id, "continuing processing updates"
							setTimeout () ->
								UpdatesManager.processUncompressedUpdates project_id, doc_id, temporary, callback
							, 0
						else
							logger.log project_id: project_id, doc_id: doc_id, "all raw updates processed"
							callback()

	# Process updates for a doc when we flush it individually
	processUncompressedUpdatesWithLock: (project_id, doc_id, callback = (error) ->) ->
		UpdatesManager._prepareProjectForUpdates project_id, (error, temporary) ->
			return callback(error) if error?
			UpdatesManager._processUncompressedUpdatesForDocWithLock project_id, doc_id, temporary, callback

	# Process updates for a doc when the whole project is flushed (internal method)
	_processUncompressedUpdatesForDocWithLock: (project_id, doc_id, temporary, callback = (error) ->) ->
		UpdatesManager._prepareDocForUpdates project_id, doc_id, (error) ->
			return callback(error) if error?
			LockManager.runWithLock(
				keys.historyLock({doc_id}),
				(releaseLock) ->
					UpdatesManager.processUncompressedUpdates project_id, doc_id, temporary, releaseLock
				callback
			)

	# Process all updates for a project, only check project-level information once
	processUncompressedUpdatesForProject: (project_id, callback = (error) ->) ->
		RedisManager.getDocIdsWithHistoryOps project_id, (error, doc_ids) ->
			return callback(error) if error?
			UpdatesManager._prepareProjectForUpdates project_id, (error, temporary) ->
				jobs = []
				for doc_id in doc_ids
					do (doc_id) ->
						jobs.push (cb) ->
							UpdatesManager._processUncompressedUpdatesForDocWithLock project_id, doc_id, temporary, cb
				async.parallelLimit jobs, 5, callback

	# flush all outstanding changes
	flushAll: (limit, callback = (error, result) ->) ->
		RedisManager.getProjectIdsWithHistoryOps (error, project_ids) ->
			return callback(error) if error?
			logger.log {count: project_ids?.length, project_ids: project_ids}, "found projects"
			jobs = []
			project_ids = _.shuffle project_ids # randomise to avoid hitting same projects each time
			selectedProjects = if limit < 0 then project_ids else project_ids[0...limit]
			for project_id in selectedProjects
				do (project_id) ->
					jobs.push (cb) ->
						UpdatesManager.processUncompressedUpdatesForProject project_id, (err) ->
							return cb(null, {failed: err?, project_id: project_id})
			async.series jobs, (error, result) ->
				return callback(error) if error?
				failedProjects = (x.project_id for x in result when x.failed)
				succeededProjects = (x.project_id for x in result when not x.failed)
				callback(null, {failed: failedProjects, succeeded: succeededProjects, all: project_ids})

	getDanglingUpdates: (callback = (error, doc_ids) ->) ->
		RedisManager.getAllDocIdsWithHistoryOps (error, all_doc_ids) ->
			return callback(error) if error?
			RedisManager.getProjectIdsWithHistoryOps (error, all_project_ids) ->
				return callback(error) if error?
				# function to get doc_ids for each project
				task = (cb) -> async.concatSeries all_project_ids, RedisManager.getDocIdsWithHistoryOps, cb
				# find the dangling doc ids
				task (error, project_doc_ids) ->
					dangling_doc_ids = _.difference(all_doc_ids, project_doc_ids)
					logger.log {all_doc_ids: all_doc_ids, all_project_ids: all_project_ids, project_doc_ids: project_doc_ids, dangling_doc_ids: dangling_doc_ids}, "checking for dangling doc ids"
					callback(null, dangling_doc_ids)

	getDocUpdates: (project_id, doc_id, options = {}, callback = (error, updates) ->) ->
		UpdatesManager.processUncompressedUpdatesWithLock project_id, doc_id, (error) ->
			return callback(error) if error?
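			# the doc's pending ops have now been flushed from redis into packs,
			# so read the compressed ops for the requested version range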
			#console.log "options", options
			PackManager.getOpsByVersionRange project_id, doc_id, options.from, options.to, (error, updates) ->
				return callback(error) if error?
				callback null, updates

	getDocUpdatesWithUserInfo: (project_id, doc_id, options = {}, callback = (error, updates) ->) ->
		UpdatesManager.getDocUpdates project_id, doc_id, options, (error, updates) ->
			return callback(error) if error?
			UpdatesManager.fillUserInfo updates, (error, updates) ->
				return callback(error) if error?
				callback null, updates

	getSummarizedProjectUpdates: (project_id, options = {}, callback = (error, updates) ->) ->
		options.min_count ||= 25
		summarizedUpdates = []
		before = options.before
		nextBeforeTimestamp = null
		UpdatesManager.processUncompressedUpdatesForProject project_id, (error) ->
			return callback(error) if error?
			PackManager.makeProjectIterator project_id, before, (err, iterator) ->
				return callback(err) if err?
				# repeatedly get updates and pass them through the summariser to get a final output with user info
				async.whilst () ->
					#console.log "checking iterator.done", iterator.done()
					return summarizedUpdates.length < options.min_count and not iterator.done()
				, (cb) ->
					iterator.next (err, partialUpdates) ->
						return callback(err) if err?
						#logger.log {partialUpdates}, 'got partialUpdates'
						return cb() if partialUpdates.length is 0 ## FIXME should try to avoid this happening
						nextBeforeTimestamp = partialUpdates[partialUpdates.length - 1].meta.end_ts
						# add the updates to the summary list
						summarizedUpdates = UpdatesManager._summarizeUpdates partialUpdates, summarizedUpdates
						cb()
				, () ->
					# finally done all updates
					#console.log 'summarized Updates', summarizedUpdates
					UpdatesManager.fillSummarizedUserInfo summarizedUpdates, (err, results) ->
						return callback(err) if err?
						callback null, results, if not iterator.done() then nextBeforeTimestamp else undefined

	fetchUserInfo: (users, callback = (error, fetchedUserInfo) ->) ->
		jobs = []
		fetchedUserInfo = {}
		for user_id of users
			do (user_id) ->
				jobs.push (callback) ->
					WebApiManager.getUserInfo user_id, (error, userInfo) ->
						return callback(error) if error?
						fetchedUserInfo[user_id] = userInfo
						callback()

		async.series jobs, (err) ->
			return callback(err) if err?
			callback(null, fetchedUserInfo)

	fillUserInfo: (updates, callback = (error, updates) ->) ->
		users = {}
		for update in updates
			user_id = update.meta.user_id
			if UpdatesManager._validUserId(user_id)
				users[user_id] = true

		UpdatesManager.fetchUserInfo users, (error, fetchedUserInfo) ->
			return callback(error) if error?
			for update in updates
				user_id = update.meta.user_id
				delete update.meta.user_id
				if UpdatesManager._validUserId(user_id)
					update.meta.user = fetchedUserInfo[user_id]
			callback null, updates

	fillSummarizedUserInfo: (updates, callback = (error, updates) ->) ->
		users = {}
		for update in updates
			user_ids = update.meta.user_ids or []
			for user_id in user_ids
				if UpdatesManager._validUserId(user_id)
					users[user_id] = true

		UpdatesManager.fetchUserInfo users, (error, fetchedUserInfo) ->
			return callback(error) if error?
			for update in updates
				user_ids = update.meta.user_ids or []
				update.meta.users = []
				delete update.meta.user_ids
				for user_id in user_ids
					if UpdatesManager._validUserId(user_id)
						update.meta.users.push fetchedUserInfo[user_id]
					else
						update.meta.users.push null
			callback null, updates

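	# user ids are expected to be 24-character hex Mongo ObjectIds;
	# anything else is treated as invalid and skipped by the fill*UserInfo methods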
	_validUserId: (user_id) ->
		if !user_id?
			return false
		else
			return !!user_id.match(/^[a-f0-9]{24}$/)

	TIME_BETWEEN_DISTINCT_UPDATES: fiveMinutes = 5 * 60 * 1000
	SPLIT_ON_DELETE_SIZE: 16 # characters

	_summarizeUpdates: (updates, existingSummarizedUpdates = []) ->
		summarizedUpdates = existingSummarizedUpdates.slice()
		previousUpdateWasBigDelete = false
		for update in updates
			earliestUpdate = summarizedUpdates[summarizedUpdates.length - 1]
			shouldConcat = false

			# If a user inserts some text, then deletes a big chunk including that text,
			# the update we show might concat the insert and delete, and there will be no sign
			# of that insert having happened, or be able to restore to it (restoring after a big delete is common).
			# So, we split the summary on 'big' deletes. However, we're stepping backwards in time with
			# most recent changes considered first, so if this update is a big delete, we want to start
			# a new summarized update next time, hence we monitor the previous update.
			if previousUpdateWasBigDelete
				shouldConcat = false
			else if earliestUpdate and earliestUpdate.meta.end_ts - update.meta.start_ts < @TIME_BETWEEN_DISTINCT_UPDATES
				# We're going backwards in time through the updates, so only combine if this update starts
				# less than 5 minutes before the end of the current summarized block, so no block spans
				# more than 5 minutes.
				shouldConcat = true

			isBigDelete = false
			for op in update.op or []
				if op.d? and op.d.length > @SPLIT_ON_DELETE_SIZE
					isBigDelete = true

			previousUpdateWasBigDelete = isBigDelete

			if shouldConcat
				# check if the user in this update is already present in the earliest update,
				# if not, add them to the users list of the earliest update
				earliestUpdate.meta.user_ids = _.union earliestUpdate.meta.user_ids, [update.meta.user_id]

				doc_id = update.doc_id.toString()
				doc = earliestUpdate.docs[doc_id]
				if doc?
					doc.fromV = Math.min(doc.fromV, update.v)
					doc.toV = Math.max(doc.toV, update.v)
				else
					earliestUpdate.docs[doc_id] =
						fromV: update.v
						toV: update.v

				earliestUpdate.meta.start_ts = Math.min(earliestUpdate.meta.start_ts, update.meta.start_ts)
				earliestUpdate.meta.end_ts = Math.max(earliestUpdate.meta.end_ts, update.meta.end_ts)
			else
				newUpdate =
					meta:
						user_ids: []
						start_ts: update.meta.start_ts
						end_ts: update.meta.end_ts
					docs: {}

				newUpdate.docs[update.doc_id.toString()] =
					fromV: update.v
					toV: update.v
				newUpdate.meta.user_ids.push update.meta.user_id

				summarizedUpdates.push newUpdate

		return summarizedUpdates