overleaf/services/track-changes/app/js/UpdatesManager.js

/* eslint-disable
no-unused-vars,
*/
// TODO: This file was created by bulk-decaffeinate.
// Fix any style issues and re-enable lint.
/*
* decaffeinate suggestions:
* DS101: Remove unnecessary use of Array.from
* DS102: Remove unnecessary code created because of implicit returns
* DS103: Rewrite code to no longer use __guard__
* DS205: Consider reworking code to avoid use of IIFEs
* DS207: Consider shorter variations of null checks
* Full docs: https://github.com/decaffeinate/decaffeinate/blob/master/docs/suggestions.md
*/
let fiveMinutes, UpdatesManager
const MongoManager = require('./MongoManager')
const PackManager = require('./PackManager')
const RedisManager = require('./RedisManager')
const UpdateCompressor = require('./UpdateCompressor')
const LockManager = require('./LockManager')
const WebApiManager = require('./WebApiManager')
const UpdateTrimmer = require('./UpdateTrimmer')
const logger = require('@overleaf/logger')
const async = require('async')
const _ = require('underscore')
const Settings = require('@overleaf/settings')
const keys = Settings.redis.lock.key_schema
const util = require('util')
module.exports = UpdatesManager = {
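// Compress a batch of raw updates for a doc and persist them via PackManager.
// Checks version ordering against the last compressed update in Mongo,
// discards updates that are already present, and empties oversized ops
// before compressing.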
compressAndSaveRawUpdates(projectId, docId, rawUpdates, temporary, callback) {
let i
if (callback == null) {
callback = function () {}
}
const { length } = rawUpdates
if (length === 0) {
return callback()
}
// check that ops are in the correct order
for (i = 0; i < rawUpdates.length; i++) {
const op = rawUpdates[i]
if (i > 0) {
const thisVersion = op != null ? op.v : undefined
const prevVersion = rawUpdates[i - 1]?.v
if (!(prevVersion < thisVersion)) {
logger.error(
{
projectId,
docId,
rawUpdates,
temporary,
thisVersion,
prevVersion,
},
'op versions out of order'
)
}
}
}
// FIXME: we no longer need the lastCompressedUpdate, so change functions not to need it
// CORRECTION: we do use it to log the time in case of error
return MongoManager.peekLastCompressedUpdate(
docId,
function (error, lastCompressedUpdate, lastVersion) {
// lastCompressedUpdate is the most recent update in Mongo, and
// lastVersion is its sharejs version number.
//
// The peekLastCompressedUpdate method may pass the update back
// as 'null' (for example if the previous compressed update has
// been archived). In this case it can still pass back the
// lastVersion from the update to allow us to check consistency.
let op
if (error != null) {
return callback(error)
}
// Ensure that raw updates start where lastVersion left off
if (lastVersion != null) {
const discardedUpdates = []
rawUpdates = rawUpdates.slice(0)
while (rawUpdates[0] != null && rawUpdates[0].v <= lastVersion) {
discardedUpdates.push(rawUpdates.shift())
}
if (discardedUpdates.length) {
logger.error(
{ projectId, docId, discardedUpdates, temporary, lastVersion },
'discarded updates already present'
)
}
if (rawUpdates[0] != null && rawUpdates[0].v !== lastVersion + 1) {
const ts = lastCompressedUpdate?.meta?.end_ts
const lastTimestamp = ts != null ? new Date(ts) : 'unknown time'
error = new Error(
`Tried to apply raw op at version ${rawUpdates[0].v} to last compressed update with version ${lastVersion} from ${lastTimestamp}`
)
logger.error(
{
err: error,
docId,
projectId,
prevEndTs: ts,
temporary,
lastCompressedUpdate,
},
'inconsistent doc versions'
)
if (
(Settings.trackchanges != null
? Settings.trackchanges.continueOnError
: undefined) &&
rawUpdates[0].v > lastVersion + 1
) {
// we have lost some ops - continue to write into the database, we can't recover at this point
lastCompressedUpdate = null
} else {
return callback(error)
}
}
}
if (rawUpdates.length === 0) {
return callback()
}
// Some old large ops in redis need to be rejected; they predate
// the size limit that now prevents them from going through the system
const REJECT_LARGE_OP_SIZE = 4 * 1024 * 1024
for (const rawUpdate of Array.from(rawUpdates)) {
const opSizes = (() => {
const result = []
for (op of Array.from(
(rawUpdate != null ? rawUpdate.op : undefined) || []
)) {
result.push(
(op.i != null ? op.i.length : undefined) ||
(op.d != null ? op.d.length : undefined)
)
}
return result
})()
const size = _.max(opSizes)
if (size > REJECT_LARGE_OP_SIZE) {
error = new Error(
`dropped op exceeding maximum allowed size of ${REJECT_LARGE_OP_SIZE}`
)
logger.error(
{ err: error, docId, projectId, size, rawUpdate },
'dropped op - too big'
)
rawUpdate.op = []
}
}
const compressedUpdates = UpdateCompressor.compressRawUpdates(
null,
rawUpdates
)
return PackManager.insertCompressedUpdates(
projectId,
docId,
lastCompressedUpdate,
compressedUpdates,
temporary,
function (error, result) {
if (error != null) {
return callback(error)
}
if (result != null) {
logger.debug(
{
projectId,
docId,
origV:
lastCompressedUpdate != null
? lastCompressedUpdate.v
: undefined,
newV: result.v,
},
'inserted updates into pack'
)
}
return callback()
}
)
}
)
},
// Check whether the updates are temporary (per-project property)
_prepareProjectForUpdates(projectId, callback) {
if (callback == null) {
callback = function () {}
}
return UpdateTrimmer.shouldTrimUpdates(
projectId,
function (error, temporary) {
if (error != null) {
return callback(error)
}
return callback(null, temporary)
}
)
},
// Ensure the project id is recorded on the document history (per-document property)
_prepareDocForUpdates(projectId, docId, callback) {
if (callback == null) {
callback = function () {}
}
return MongoManager.backportProjectId(projectId, docId, function (error) {
if (error != null) {
return callback(error)
}
return callback(null)
})
},
// Apply updates for specific project/doc after preparing at project and doc level
REDIS_READ_BATCH_SIZE: 100,
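// Read up to REDIS_READ_BATCH_SIZE raw updates for the doc from redis,
// compress and save them, delete the applied updates from redis, and
// repeat (via setTimeout) while full batches keep coming back.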
processUncompressedUpdates(projectId, docId, temporary, callback) {
// get the updates as strings from redis (so we can delete them after they are applied)
if (callback == null) {
callback = function () {}
}
return RedisManager.getOldestDocUpdates(
docId,
UpdatesManager.REDIS_READ_BATCH_SIZE,
function (error, docUpdates) {
if (error != null) {
return callback(error)
}
const { length } = docUpdates
// parse the redis strings into ShareJs updates
return RedisManager.expandDocUpdates(
docUpdates,
function (error, rawUpdates) {
if (error != null) {
logger.err(
{ projectId, docId, docUpdates },
'failed to parse docUpdates'
)
return callback(error)
}
logger.debug(
{ projectId, docId, rawUpdates },
'retrieved raw updates from redis'
)
return UpdatesManager.compressAndSaveRawUpdates(
projectId,
docId,
rawUpdates,
temporary,
function (error) {
if (error != null) {
return callback(error)
}
logger.debug(
{ projectId, docId },
'compressed and saved doc updates'
)
// delete the applied updates from redis
return RedisManager.deleteAppliedDocUpdates(
projectId,
docId,
docUpdates,
function (error) {
if (error != null) {
return callback(error)
}
if (length === UpdatesManager.REDIS_READ_BATCH_SIZE) {
// There might be more updates
logger.debug(
{ projectId, docId },
'continuing processing updates'
)
return setTimeout(
() =>
UpdatesManager.processUncompressedUpdates(
projectId,
docId,
temporary,
callback
),
0
)
} else {
logger.debug(
{ projectId, docId },
'all raw updates processed'
)
return callback()
}
}
)
}
)
}
)
}
)
},
// Process updates for a doc when we flush it individually
processUncompressedUpdatesWithLock(projectId, docId, callback) {
if (callback == null) {
callback = function () {}
}
return UpdatesManager._prepareProjectForUpdates(
projectId,
function (error, temporary) {
if (error != null) {
return callback(error)
}
return UpdatesManager._processUncompressedUpdatesForDocWithLock(
projectId,
docId,
temporary,
callback
)
}
)
},
// Process updates for a doc when the whole project is flushed (internal method)
_processUncompressedUpdatesForDocWithLock(
projectId,
docId,
temporary,
callback
) {
if (callback == null) {
callback = function () {}
}
return UpdatesManager._prepareDocForUpdates(
projectId,
docId,
function (error) {
if (error != null) {
return callback(error)
}
return LockManager.runWithLock(
keys.historyLock({ doc_id: docId }),
releaseLock =>
UpdatesManager.processUncompressedUpdates(
projectId,
docId,
temporary,
releaseLock
),
callback
)
}
)
},
// Process all updates for a project, only check project-level information once
processUncompressedUpdatesForProject(projectId, callback) {
if (callback == null) {
callback = function () {}
}
return RedisManager.getDocIdsWithHistoryOps(
projectId,
function (error, docIds) {
if (error != null) {
return callback(error)
}
return UpdatesManager._prepareProjectForUpdates(
projectId,
function (error, temporary) {
if (error) return callback(error)
const jobs = []
for (const docId of Array.from(docIds)) {
;(docId =>
jobs.push(cb =>
UpdatesManager._processUncompressedUpdatesForDocWithLock(
projectId,
docId,
temporary,
cb
)
))(docId)
}
return async.parallelLimit(jobs, 5, callback)
}
)
}
)
},
// flush all outstanding changes
flushAll(limit, callback) {
if (callback == null) {
callback = function () {}
}
return RedisManager.getProjectIdsWithHistoryOps(function (
error,
projectIds
) {
let projectId
if (error != null) {
return callback(error)
}
logger.debug(
{
count: projectIds != null ? projectIds.length : undefined,
projectIds,
},
'found projects'
)
const jobs = []
projectIds = _.shuffle(projectIds) // randomise to avoid hitting the same projects each time
const selectedProjects =
limit < 0 ? projectIds : projectIds.slice(0, limit)
for (projectId of Array.from(selectedProjects)) {
;(projectId =>
jobs.push(cb =>
UpdatesManager.processUncompressedUpdatesForProject(
projectId,
err => cb(null, { failed: err != null, project_id: projectId })
)
))(projectId)
}
return async.series(jobs, function (error, result) {
let x
if (error != null) {
return callback(error)
}
const failedProjects = (() => {
const result1 = []
for (x of Array.from(result)) {
if (x.failed) {
result1.push(x.project_id)
}
}
return result1
})()
const succeededProjects = (() => {
const result2 = []
for (x of Array.from(result)) {
if (!x.failed) {
result2.push(x.project_id)
}
}
return result2
})()
return callback(null, {
failed: failedProjects,
succeeded: succeededProjects,
all: projectIds,
})
})
})
},
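// Find doc ids that have history ops in redis but do not belong to any
// project currently listed as having history ops.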
getDanglingUpdates(callback) {
if (callback == null) {
callback = function () {}
}
return RedisManager.getAllDocIdsWithHistoryOps(function (error, allDocIds) {
if (error != null) {
return callback(error)
}
return RedisManager.getProjectIdsWithHistoryOps(function (
error,
allProjectIds
) {
if (error != null) {
return callback(error)
}
// function to get doc_ids for each project
const task = cb =>
async.concatSeries(
allProjectIds,
RedisManager.getDocIdsWithHistoryOps,
cb
)
// find the dangling doc ids
return task(function (error, projectDocIds) {
if (error) return callback(error)
const danglingDocIds = _.difference(allDocIds, projectDocIds)
logger.debug(
{ allDocIds, allProjectIds, projectDocIds, danglingDocIds },
'checking for dangling doc ids'
)
return callback(null, danglingDocIds)
})
})
})
},
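// Get the updates for a doc within the version range given by
// options.from/options.to, flushing uncompressed updates from redis first.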
getDocUpdates(projectId, docId, options, callback) {
if (options == null) {
options = {}
}
if (callback == null) {
callback = function () {}
}
return UpdatesManager.processUncompressedUpdatesWithLock(
projectId,
docId,
function (error) {
if (error != null) {
return callback(error)
}
// console.log "options", options
return PackManager.getOpsByVersionRange(
projectId,
docId,
options.from,
options.to,
function (error, updates) {
if (error != null) {
return callback(error)
}
return callback(null, updates)
}
)
}
)
},
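// As getDocUpdates, but with each update's user_id expanded into full user info.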
getDocUpdatesWithUserInfo(projectId, docId, options, callback) {
if (options == null) {
options = {}
}
if (callback == null) {
callback = function () {}
}
return UpdatesManager.getDocUpdates(
projectId,
docId,
options,
function (error, updates) {
if (error != null) {
return callback(error)
}
return UpdatesManager.fillUserInfo(updates, function (error, updates) {
if (error != null) {
return callback(error)
}
return callback(null, updates)
})
}
)
},
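// Build a summarized history for a project: flush pending updates, then
// iterate over packs backwards in time (optionally starting before
// options.before), merging updates until at least options.min_count
// summaries have been collected, and fill in user info.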
getSummarizedProjectUpdates(projectId, options, callback) {
if (options == null) {
options = {}
}
if (callback == null) {
callback = function () {}
}
if (!options.min_count) {
options.min_count = 25
}
let summarizedUpdates = []
const { before } = options
let nextBeforeTimestamp = null
return UpdatesManager.processUncompressedUpdatesForProject(
projectId,
function (error) {
if (error != null) {
return callback(error)
}
return PackManager.makeProjectIterator(
projectId,
before,
function (err, iterator) {
if (err != null) {
return callback(err)
}
// repeatedly get updates and pass them through the summariser to get a final output with user info
return async.whilst(
cb =>
// console.log "checking iterator.done", iterator.done()
cb(
null,
summarizedUpdates.length < options.min_count &&
!iterator.done()
),
cb =>
iterator.next(function (err, partialUpdates) {
if (err != null) {
return callback(err)
}
// logger.log {partialUpdates}, 'got partialUpdates'
if (partialUpdates.length === 0) {
return cb()
} // FIXME: should try to avoid this happening
nextBeforeTimestamp =
partialUpdates[partialUpdates.length - 1].meta.end_ts
// add the updates to the summary list
summarizedUpdates = UpdatesManager._summarizeUpdates(
partialUpdates,
summarizedUpdates
)
return cb()
}),
() =>
// finally done all updates
// console.log 'summarized Updates', summarizedUpdates
UpdatesManager.fillSummarizedUserInfo(
summarizedUpdates,
function (err, results) {
if (err != null) {
return callback(err)
}
return callback(
null,
results,
!iterator.done() ? nextBeforeTimestamp : undefined
)
}
)
)
}
)
}
)
},
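// Stream a project's full history to `consumer`: consumer(err) on error,
// consumer(null, { updates }, cb) for each pack of updates, and finally
// consumer(null, { updates: [], userIds }) with the sorted user ids seen.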
exportProject(projectId, consumer) {
// Flush anything before collecting updates.
UpdatesManager.processUncompressedUpdatesForProject(projectId, err => {
if (err) return consumer(err)
// Fetch all the packs.
const before = undefined
PackManager.makeProjectIterator(projectId, before, (err, iterator) => {
if (err) return consumer(err)
const accumulatedUserIds = new Set()
async.whilst(
cb => cb(null, !iterator.done()),
cb =>
iterator.next((err, updatesFromASinglePack) => {
if (err) return cb(err)
if (updatesFromASinglePack.length === 0) {
// This should not happen when `iterator.done() == false`.
// Emitting an empty array would signal to the consumer that this
// is the final call.
return cb()
}
updatesFromASinglePack.forEach(update => {
accumulatedUserIds.add(
// Super defensive access on update details.
String(update && update.meta && update.meta.user_id)
)
})
// Emit updates and wait for the consumer.
consumer(null, { updates: updatesFromASinglePack }, cb)
}),
err => {
if (err) return consumer(err)
// Adding 'undefined' can happen for broken updates that lack meta.user_id.
accumulatedUserIds.delete('undefined')
consumer(null, {
updates: [],
userIds: Array.from(accumulatedUserIds).sort(),
})
}
)
})
})
},
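// Fetch user info from the web API for each user id key in `users`,
// returning a map of user id -> user info.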
fetchUserInfo(users, callback) {
if (callback == null) {
callback = function () {}
}
const jobs = []
const fetchedUserInfo = {}
for (const userId in users) {
;(userId =>
jobs.push(callback =>
WebApiManager.getUserInfo(userId, function (error, userInfo) {
if (error != null) {
return callback(error)
}
fetchedUserInfo[userId] = userInfo
return callback()
})
))(userId)
}
return async.series(jobs, function (err) {
if (err != null) {
return callback(err)
}
return callback(null, fetchedUserInfo)
})
},
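// Replace meta.user_id on each update with the fetched user info (meta.user).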
fillUserInfo(updates, callback) {
let update, userId
if (callback == null) {
callback = function () {}
}
const users = {}
for (update of Array.from(updates)) {
;({ user_id: userId } = update.meta)
if (UpdatesManager._validUserId(userId)) {
users[userId] = true
}
}
return UpdatesManager.fetchUserInfo(
users,
function (error, fetchedUserInfo) {
if (error != null) {
return callback(error)
}
for (update of Array.from(updates)) {
;({ user_id: userId } = update.meta)
delete update.meta.user_id
if (UpdatesManager._validUserId(userId)) {
update.meta.user = fetchedUserInfo[userId]
}
}
return callback(null, updates)
}
)
},
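// Replace meta.user_ids on each summarized update with a meta.users array,
// pushing null for ids that are not valid user ids.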
fillSummarizedUserInfo(updates, callback) {
let update, userId, userIds
if (callback == null) {
callback = function () {}
}
const users = {}
for (update of Array.from(updates)) {
userIds = update.meta.user_ids || []
for (userId of Array.from(userIds)) {
if (UpdatesManager._validUserId(userId)) {
users[userId] = true
}
}
}
return UpdatesManager.fetchUserInfo(
users,
function (error, fetchedUserInfo) {
if (error != null) {
return callback(error)
}
for (update of Array.from(updates)) {
userIds = update.meta.user_ids || []
update.meta.users = []
delete update.meta.user_ids
for (userId of Array.from(userIds)) {
if (UpdatesManager._validUserId(userId)) {
update.meta.users.push(fetchedUserInfo[userId])
} else {
update.meta.users.push(null)
}
}
}
return callback(null, updates)
}
)
},
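// A valid user id is a 24-character lowercase hex string (a BSON ObjectId).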
_validUserId(userId) {
if (userId == null) {
return false
} else {
return !!userId.match(/^[a-f0-9]{24}$/)
}
},
TIME_BETWEEN_DISTINCT_UPDATES: (fiveMinutes = 5 * 60 * 1000),
SPLIT_ON_DELETE_SIZE: 16, // characters
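// Merge updates (processed newest-first) into summary blocks. An update is
// combined into the current block if it starts within
// TIME_BETWEEN_DISTINCT_UPDATES of the block's end, but a new block is
// started after a delete larger than SPLIT_ON_DELETE_SIZE so that large
// deletions remain visible (and restorable) in the history.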
_summarizeUpdates(updates, existingSummarizedUpdates) {
if (existingSummarizedUpdates == null) {
existingSummarizedUpdates = []
}
const summarizedUpdates = existingSummarizedUpdates.slice()
let previousUpdateWasBigDelete = false
for (const update of Array.from(updates)) {
let docId
const earliestUpdate = summarizedUpdates[summarizedUpdates.length - 1]
let shouldConcat = false
// If a user inserts some text, then deletes a big chunk including that text,
// the update we show might concat the insert and delete, and there will be no sign
// of that insert having happened, nor any way to restore to it (restoring after a big delete is common).
// So, we split the summary on 'big' deletes. However, we're stepping backwards in time with
// the most recent changes considered first, so if this update is a big delete, we want to start
// a new summarized update next time, hence we monitor the previous update.
if (previousUpdateWasBigDelete) {
shouldConcat = false
} else if (
earliestUpdate &&
earliestUpdate.meta.end_ts - update.meta.start_ts <
this.TIME_BETWEEN_DISTINCT_UPDATES
) {
// We're going backwards in time through the updates, so only combine if this update starts less than 5 minutes before
// the end of the current summarized block, so that no block spans more than 5 minutes.
shouldConcat = true
}
let isBigDelete = false
for (const op of Array.from(update.op || [])) {
if (op.d != null && op.d.length > this.SPLIT_ON_DELETE_SIZE) {
isBigDelete = true
}
}
previousUpdateWasBigDelete = isBigDelete
if (shouldConcat) {
// check if the user in this update is already present in the earliest update,
// if not, add them to the users list of the earliest update
earliestUpdate.meta.user_ids = _.union(earliestUpdate.meta.user_ids, [
update.meta.user_id,
])
docId = update.doc_id.toString()
const doc = earliestUpdate.docs[docId]
if (doc != null) {
doc.fromV = Math.min(doc.fromV, update.v)
doc.toV = Math.max(doc.toV, update.v)
} else {
earliestUpdate.docs[docId] = {
fromV: update.v,
toV: update.v,
}
}
earliestUpdate.meta.start_ts = Math.min(
earliestUpdate.meta.start_ts,
update.meta.start_ts
)
earliestUpdate.meta.end_ts = Math.max(
earliestUpdate.meta.end_ts,
update.meta.end_ts
)
} else {
const newUpdate = {
meta: {
user_ids: [],
start_ts: update.meta.start_ts,
end_ts: update.meta.end_ts,
},
docs: {},
}
newUpdate.docs[update.doc_id.toString()] = {
fromV: update.v,
toV: update.v,
}
newUpdate.meta.user_ids.push(update.meta.user_id)
summarizedUpdates.push(newUpdate)
}
}
return summarizedUpdates
},
}
module.exports.promises = {
processUncompressedUpdatesForProject: util.promisify(
UpdatesManager.processUncompressedUpdatesForProject
),
}
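// Helper emitted by decaffeinate (see DS103 above); no call sites remain in this file.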
function __guard__(value, transform) {
return typeof value !== 'undefined' && value !== null
? transform(value)
: undefined
}
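// Example usage (illustrative sketch only; `someProjectId` and the error
// handling below are placeholders, not part of this module):
//
//   const UpdatesManager = require('./UpdatesManager')
//
//   // Callback API: summarize recent history for a project
//   UpdatesManager.getSummarizedProjectUpdates(
//     someProjectId,
//     { min_count: 25 },
//     (err, summaries, nextBeforeTimestamp) => {
//       if (err) throw err
//       // nextBeforeTimestamp is undefined once all packs have been consumed
//     }
//   )
//
//   // Promise API (inside an async function): flush all pending updates
//   await UpdatesManager.promises.processUncompressedUpdatesForProject(someProjectId)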