// File: overleaf/services/track-changes/app/js/UpdatesManager.js

/* eslint-disable
camelcase,
handle-callback-err,
no-unused-vars,
*/
// TODO: This file was created by bulk-decaffeinate.
// Fix any style issues and re-enable lint.
/*
* decaffeinate suggestions:
* DS101: Remove unnecessary use of Array.from
* DS102: Remove unnecessary code created because of implicit returns
* DS103: Rewrite code to no longer use __guard__
* DS205: Consider reworking code to avoid use of IIFEs
* DS207: Consider shorter variations of null checks
* Full docs: https://github.com/decaffeinate/decaffeinate/blob/master/docs/suggestions.md
*/
let fiveMinutes, UpdatesManager
const MongoManager = require('./MongoManager')
const PackManager = require('./PackManager')
const RedisManager = require('./RedisManager')
const UpdateCompressor = require('./UpdateCompressor')
const LockManager = require('./LockManager')
const WebApiManager = require('./WebApiManager')
const UpdateTrimmer = require('./UpdateTrimmer')
const logger = require('logger-sharelatex')
const async = require('async')
const _ = require('underscore')
const Settings = require('@overleaf/settings')
const keys = Settings.redis.lock.key_schema
const util = require('util')
module.exports = UpdatesManager = {
compressAndSaveRawUpdates(
project_id,
doc_id,
rawUpdates,
temporary,
callback
) {
let i
if (callback == null) {
2020-06-04 04:24:21 -04:00
callback = function (error) {}
}
const { length } = rawUpdates
if (length === 0) {
return callback()
}
// check that ops are in the correct order
for (i = 0; i < rawUpdates.length; i++) {
const op = rawUpdates[i]
if (i > 0) {
const thisVersion = op != null ? op.v : undefined
2021-07-13 07:04:43 -04:00
const prevVersion = __guard__(rawUpdates[i - 1], x => x.v)
if (!(prevVersion < thisVersion)) {
logger.error(
{
project_id,
doc_id,
rawUpdates,
temporary,
thisVersion,
2021-07-13 07:04:43 -04:00
prevVersion,
},
'op versions out of order'
)
}
}
}
// FIXME: we no longer need the lastCompressedUpdate, so change functions not to need it
// CORRECTION: we do use it to log the time in case of error
2021-07-13 07:04:43 -04:00
return MongoManager.peekLastCompressedUpdate(
doc_id,
function (error, lastCompressedUpdate, lastVersion) {
// lastCompressedUpdate is the most recent update in Mongo, and
// lastVersion is its sharejs version number.
//
// The peekLastCompressedUpdate method may pass the update back
// as 'null' (for example if the previous compressed update has
// been archived). In this case it can still pass back the
// lastVersion from the update to allow us to check consistency.
let op
if (error != null) {
return callback(error)
}
2021-07-13 07:04:43 -04:00
// Ensure that raw updates start where lastVersion left off
if (lastVersion != null) {
const discardedUpdates = []
rawUpdates = rawUpdates.slice(0)
while (rawUpdates[0] != null && rawUpdates[0].v <= lastVersion) {
discardedUpdates.push(rawUpdates.shift())
}
2021-07-13 07:04:43 -04:00
if (discardedUpdates.length) {
logger.error(
{ project_id, doc_id, discardedUpdates, temporary, lastVersion },
'discarded updates already present'
)
}
2021-07-13 07:04:43 -04:00
if (rawUpdates[0] != null && rawUpdates[0].v !== lastVersion + 1) {
const ts = __guard__(
lastCompressedUpdate != null
? lastCompressedUpdate.meta
: undefined,
x1 => x1.end_ts
)
const last_timestamp = ts != null ? new Date(ts) : 'unknown time'
error = new Error(
`Tried to apply raw op at version ${rawUpdates[0].v} to last compressed update with version ${lastVersion} from ${last_timestamp}`
)
logger.error(
{
2021-07-13 07:04:43 -04:00
err: error,
doc_id,
2021-07-13 07:04:43 -04:00
project_id,
prev_end_ts: ts,
temporary,
lastCompressedUpdate,
},
2021-07-13 07:04:43 -04:00
'inconsistent doc versions'
)
2021-07-13 07:04:43 -04:00
if (
(Settings.trackchanges != null
? Settings.trackchanges.continueOnError
: undefined) &&
rawUpdates[0].v > lastVersion + 1
) {
// we have lost some ops - continue to write into the database, we can't recover at this point
lastCompressedUpdate = null
} else {
return callback(error)
}
}
2021-07-13 07:04:43 -04:00
}
if (rawUpdates.length === 0) {
return callback()
}
2021-07-13 07:04:43 -04:00
// some old large ops in redis need to be rejected, they predate
// the size limit that now prevents them going through the system
const REJECT_LARGE_OP_SIZE = 4 * 1024 * 1024
for (var rawUpdate of Array.from(rawUpdates)) {
const opSizes = (() => {
const result = []
for (op of Array.from(
(rawUpdate != null ? rawUpdate.op : undefined) || []
)) {
result.push(
(op.i != null ? op.i.length : undefined) ||
(op.d != null ? op.d.length : undefined)
)
}
return result
})()
const size = _.max(opSizes)
if (size > REJECT_LARGE_OP_SIZE) {
error = new Error(
`dropped op exceeding maximum allowed size of ${REJECT_LARGE_OP_SIZE}`
)
logger.error(
{ err: error, doc_id, project_id, size, rawUpdate },
'dropped op - too big'
)
rawUpdate.op = []
}
}
const compressedUpdates = UpdateCompressor.compressRawUpdates(
null,
rawUpdates
)
return PackManager.insertCompressedUpdates(
project_id,
doc_id,
lastCompressedUpdate,
compressedUpdates,
temporary,
function (error, result) {
if (error != null) {
return callback(error)
}
if (result != null) {
logger.log(
{
project_id,
doc_id,
orig_v:
lastCompressedUpdate != null
? lastCompressedUpdate.v
: undefined,
new_v: result.v,
},
'inserted updates into pack'
)
}
return callback()
}
)
}
)
},
// Check whether the updates are temporary (per-project property)
_prepareProjectForUpdates(project_id, callback) {
if (callback == null) {
2020-06-04 04:24:21 -04:00
callback = function (error, temporary) {}
}
2021-07-13 07:04:43 -04:00
return UpdateTrimmer.shouldTrimUpdates(
project_id,
function (error, temporary) {
if (error != null) {
return callback(error)
}
return callback(null, temporary)
}
2021-07-13 07:04:43 -04:00
)
},
// Check for project id on document history (per-document property)
_prepareDocForUpdates(project_id, doc_id, callback) {
if (callback == null) {
2020-06-04 04:24:21 -04:00
callback = function (error) {}
}
2020-06-04 04:24:21 -04:00
return MongoManager.backportProjectId(project_id, doc_id, function (error) {
if (error != null) {
return callback(error)
}
return callback(null)
})
},
// Apply updates for specific project/doc after preparing at project and doc level
REDIS_READ_BATCH_SIZE: 100,
processUncompressedUpdates(project_id, doc_id, temporary, callback) {
// get the updates as strings from redis (so we can delete them after they are applied)
if (callback == null) {
2020-06-04 04:24:21 -04:00
callback = function (error) {}
}
return RedisManager.getOldestDocUpdates(
doc_id,
UpdatesManager.REDIS_READ_BATCH_SIZE,
2020-06-04 04:24:21 -04:00
function (error, docUpdates) {
if (error != null) {
return callback(error)
}
const { length } = docUpdates
// parse the redis strings into ShareJs updates
2021-07-13 07:04:43 -04:00
return RedisManager.expandDocUpdates(
docUpdates,
function (error, rawUpdates) {
if (error != null) {
logger.err(
{ project_id, doc_id, docUpdates },
'failed to parse docUpdates'
)
2021-07-13 07:04:43 -04:00
return callback(error)
}
2021-07-13 07:04:43 -04:00
logger.log(
{ project_id, doc_id, rawUpdates },
'retrieved raw updates from redis'
)
return UpdatesManager.compressAndSaveRawUpdates(
project_id,
doc_id,
rawUpdates,
temporary,
function (error) {
if (error != null) {
return callback(error)
}
logger.log(
{ project_id, doc_id },
'compressed and saved doc updates'
)
// delete the applied updates from redis
return RedisManager.deleteAppliedDocUpdates(
project_id,
doc_id,
docUpdates,
function (error) {
if (error != null) {
return callback(error)
}
if (length === UpdatesManager.REDIS_READ_BATCH_SIZE) {
// There might be more updates
logger.log(
{ project_id, doc_id },
'continuing processing updates'
)
return setTimeout(
() =>
UpdatesManager.processUncompressedUpdates(
project_id,
doc_id,
temporary,
callback
),
0
)
} else {
logger.log(
{ project_id, doc_id },
'all raw updates processed'
)
return callback()
}
}
)
}
)
}
)
}
)
},
// Process updates for a doc when we flush it individually
processUncompressedUpdatesWithLock(project_id, doc_id, callback) {
if (callback == null) {
2020-06-04 04:24:21 -04:00
callback = function (error) {}
}
2021-07-13 07:04:43 -04:00
return UpdatesManager._prepareProjectForUpdates(
project_id,
function (error, temporary) {
if (error != null) {
return callback(error)
}
return UpdatesManager._processUncompressedUpdatesForDocWithLock(
project_id,
doc_id,
temporary,
callback
)
}
2021-07-13 07:04:43 -04:00
)
},
// Process updates for a doc when the whole project is flushed (internal method)
_processUncompressedUpdatesForDocWithLock(
project_id,
doc_id,
temporary,
callback
) {
if (callback == null) {
2020-06-04 04:24:21 -04:00
callback = function (error) {}
}
2021-07-13 07:04:43 -04:00
return UpdatesManager._prepareDocForUpdates(
project_id,
doc_id,
function (error) {
if (error != null) {
return callback(error)
}
return LockManager.runWithLock(
keys.historyLock({ doc_id }),
releaseLock =>
UpdatesManager.processUncompressedUpdates(
project_id,
doc_id,
temporary,
releaseLock
),
callback
)
}
2021-07-13 07:04:43 -04:00
)
},
// Process all updates for a project, only check project-level information once
processUncompressedUpdatesForProject(project_id, callback) {
if (callback == null) {
2020-06-04 04:24:21 -04:00
callback = function (error) {}
}
2021-07-13 07:04:43 -04:00
return RedisManager.getDocIdsWithHistoryOps(
project_id,
function (error, doc_ids) {
if (error != null) {
return callback(error)
}
2021-07-13 07:04:43 -04:00
return UpdatesManager._prepareProjectForUpdates(
project_id,
function (error, temporary) {
const jobs = []
for (const doc_id of Array.from(doc_ids)) {
;(doc_id =>
jobs.push(cb =>
UpdatesManager._processUncompressedUpdatesForDocWithLock(
project_id,
doc_id,
temporary,
cb
)
))(doc_id)
}
return async.parallelLimit(jobs, 5, callback)
}
)
}
)
},
// flush all outstanding changes
flushAll(limit, callback) {
if (callback == null) {
2020-06-04 04:24:21 -04:00
callback = function (error, result) {}
}
2020-06-04 04:24:21 -04:00
return RedisManager.getProjectIdsWithHistoryOps(function (
error,
project_ids
) {
let project_id
if (error != null) {
return callback(error)
}
logger.log(
{
count: project_ids != null ? project_ids.length : undefined,
2021-07-13 07:04:43 -04:00
project_ids,
},
'found projects'
)
const jobs = []
project_ids = _.shuffle(project_ids) // randomise to avoid hitting same projects each time
const selectedProjects =
limit < 0 ? project_ids : project_ids.slice(0, limit)
for (project_id of Array.from(selectedProjects)) {
2021-07-13 07:04:43 -04:00
;(project_id =>
jobs.push(cb =>
UpdatesManager.processUncompressedUpdatesForProject(
project_id,
2021-07-13 07:04:43 -04:00
err => cb(null, { failed: err != null, project_id })
)
))(project_id)
}
2020-06-04 04:24:21 -04:00
return async.series(jobs, function (error, result) {
let x
if (error != null) {
return callback(error)
}
const failedProjects = (() => {
const result1 = []
for (x of Array.from(result)) {
if (x.failed) {
result1.push(x.project_id)
}
}
return result1
})()
const succeededProjects = (() => {
const result2 = []
for (x of Array.from(result)) {
if (!x.failed) {
result2.push(x.project_id)
}
}
return result2
})()
return callback(null, {
failed: failedProjects,
succeeded: succeededProjects,
2021-07-13 07:04:43 -04:00
all: project_ids,
})
})
})
},
getDanglingUpdates(callback) {
if (callback == null) {
2020-06-04 04:24:21 -04:00
callback = function (error, doc_ids) {}
}
2020-06-04 04:24:21 -04:00
return RedisManager.getAllDocIdsWithHistoryOps(function (
error,
all_doc_ids
) {
if (error != null) {
return callback(error)
}
2020-06-04 04:24:21 -04:00
return RedisManager.getProjectIdsWithHistoryOps(function (
error,
all_project_ids
) {
if (error != null) {
return callback(error)
}
// function to get doc_ids for each project
2021-07-13 07:04:43 -04:00
const task = cb =>
async.concatSeries(
all_project_ids,
RedisManager.getDocIdsWithHistoryOps,
cb
)
// find the dangling doc ids
2020-06-04 04:24:21 -04:00
return task(function (error, project_doc_ids) {
const dangling_doc_ids = _.difference(all_doc_ids, project_doc_ids)
logger.log(
{ all_doc_ids, all_project_ids, project_doc_ids, dangling_doc_ids },
'checking for dangling doc ids'
)
return callback(null, dangling_doc_ids)
})
})
})
},
getDocUpdates(project_id, doc_id, options, callback) {
if (options == null) {
options = {}
}
if (callback == null) {
2020-06-04 04:24:21 -04:00
callback = function (error, updates) {}
}
return UpdatesManager.processUncompressedUpdatesWithLock(
project_id,
doc_id,
2020-06-04 04:24:21 -04:00
function (error) {
if (error != null) {
return callback(error)
}
// console.log "options", options
return PackManager.getOpsByVersionRange(
project_id,
doc_id,
options.from,
options.to,
2020-06-04 04:24:21 -04:00
function (error, updates) {
if (error != null) {
return callback(error)
}
return callback(null, updates)
}
)
}
)
},
getDocUpdatesWithUserInfo(project_id, doc_id, options, callback) {
if (options == null) {
options = {}
}
if (callback == null) {
2020-06-04 04:24:21 -04:00
callback = function (error, updates) {}
}
2021-07-13 07:04:43 -04:00
return UpdatesManager.getDocUpdates(
project_id,
doc_id,
options,
function (error, updates) {
if (error != null) {
return callback(error)
}
2021-07-13 07:04:43 -04:00
return UpdatesManager.fillUserInfo(updates, function (error, updates) {
if (error != null) {
return callback(error)
}
return callback(null, updates)
})
}
)
},
getSummarizedProjectUpdates(project_id, options, callback) {
if (options == null) {
options = {}
}
if (callback == null) {
2020-06-04 04:24:21 -04:00
callback = function (error, updates) {}
}
if (!options.min_count) {
options.min_count = 25
}
let summarizedUpdates = []
const { before } = options
let nextBeforeTimestamp = null
return UpdatesManager.processUncompressedUpdatesForProject(
project_id,
2020-06-04 04:24:21 -04:00
function (error) {
if (error != null) {
return callback(error)
}
2021-07-13 07:04:43 -04:00
return PackManager.makeProjectIterator(
project_id,
before,
function (err, iterator) {
if (err != null) {
return callback(err)
}
// repeatedly get updates and pass them through the summariser to get an final output with user info
return async.whilst(
() =>
// console.log "checking iterator.done", iterator.done()
summarizedUpdates.length < options.min_count &&
!iterator.done(),
cb =>
iterator.next(function (err, partialUpdates) {
2020-06-04 04:24:21 -04:00
if (err != null) {
return callback(err)
}
2021-07-13 07:04:43 -04:00
// logger.log {partialUpdates}, 'got partialUpdates'
if (partialUpdates.length === 0) {
return cb()
} // # FIXME should try to avoid this happening
nextBeforeTimestamp =
partialUpdates[partialUpdates.length - 1].meta.end_ts
// add the updates to the summary list
summarizedUpdates = UpdatesManager._summarizeUpdates(
partialUpdates,
summarizedUpdates
2020-06-04 04:24:21 -04:00
)
2021-07-13 07:04:43 -04:00
return cb()
}),
() =>
// finally done all updates
// console.log 'summarized Updates', summarizedUpdates
UpdatesManager.fillSummarizedUserInfo(
summarizedUpdates,
function (err, results) {
if (err != null) {
return callback(err)
}
return callback(
null,
results,
!iterator.done() ? nextBeforeTimestamp : undefined
)
}
)
)
}
)
}
)
},
exportProject(projectId, consumer) {
// Flush anything before collecting updates.
2021-07-13 07:04:43 -04:00
UpdatesManager.processUncompressedUpdatesForProject(projectId, err => {
if (err) return consumer(err)
// Fetch all the packs.
const before = undefined
PackManager.makeProjectIterator(projectId, before, (err, iterator) => {
if (err) return consumer(err)
const accumulatedUserIds = new Set()
async.whilst(
() => !iterator.done(),
2021-07-13 07:04:43 -04:00
cb =>
iterator.next((err, updatesFromASinglePack) => {
if (err) return cb(err)
if (updatesFromASinglePack.length === 0) {
// This should not happen when `iterator.done() == false`.
// Emitting an empty array would signal the consumer the final
// call.
return cb()
}
2021-07-13 07:04:43 -04:00
updatesFromASinglePack.forEach(update => {
accumulatedUserIds.add(
// Super defensive access on update details.
String(update && update.meta && update.meta.user_id)
)
})
// Emit updates and wait for the consumer.
consumer(null, { updates: updatesFromASinglePack }, cb)
}),
2021-07-13 07:04:43 -04:00
err => {
if (err) return consumer(err)
// Adding undefined can happen for broken updates.
accumulatedUserIds.delete('undefined')
consumer(null, {
updates: [],
2021-07-13 07:04:43 -04:00
userIds: Array.from(accumulatedUserIds).sort(),
})
}
)
})
})
},
fetchUserInfo(users, callback) {
if (callback == null) {
2020-06-04 04:24:21 -04:00
callback = function (error, fetchedUserInfo) {}
}
const jobs = []
const fetchedUserInfo = {}
for (const user_id in users) {
2021-07-13 07:04:43 -04:00
;(user_id =>
jobs.push(callback =>
2020-06-04 04:24:21 -04:00
WebApiManager.getUserInfo(user_id, function (error, userInfo) {
if (error != null) {
return callback(error)
}
fetchedUserInfo[user_id] = userInfo
return callback()
})
))(user_id)
}
2020-06-04 04:24:21 -04:00
return async.series(jobs, function (err) {
if (err != null) {
return callback(err)
}
return callback(null, fetchedUserInfo)
})
},
fillUserInfo(updates, callback) {
let update, user_id
if (callback == null) {
2020-06-04 04:24:21 -04:00
callback = function (error, updates) {}
}
const users = {}
for (update of Array.from(updates)) {
;({ user_id } = update.meta)
if (UpdatesManager._validUserId(user_id)) {
users[user_id] = true
}
}
2021-07-13 07:04:43 -04:00
return UpdatesManager.fetchUserInfo(
users,
function (error, fetchedUserInfo) {
if (error != null) {
return callback(error)
}
for (update of Array.from(updates)) {
;({ user_id } = update.meta)
delete update.meta.user_id
if (UpdatesManager._validUserId(user_id)) {
update.meta.user = fetchedUserInfo[user_id]
}
}
2021-07-13 07:04:43 -04:00
return callback(null, updates)
}
2021-07-13 07:04:43 -04:00
)
},
fillSummarizedUserInfo(updates, callback) {
let update, user_id, user_ids
if (callback == null) {
2020-06-04 04:24:21 -04:00
callback = function (error, updates) {}
}
const users = {}
for (update of Array.from(updates)) {
user_ids = update.meta.user_ids || []
for (user_id of Array.from(user_ids)) {
if (UpdatesManager._validUserId(user_id)) {
users[user_id] = true
}
}
}
2021-07-13 07:04:43 -04:00
return UpdatesManager.fetchUserInfo(
users,
function (error, fetchedUserInfo) {
if (error != null) {
return callback(error)
}
for (update of Array.from(updates)) {
user_ids = update.meta.user_ids || []
update.meta.users = []
delete update.meta.user_ids
for (user_id of Array.from(user_ids)) {
if (UpdatesManager._validUserId(user_id)) {
update.meta.users.push(fetchedUserInfo[user_id])
} else {
update.meta.users.push(null)
}
}
}
2021-07-13 07:04:43 -04:00
return callback(null, updates)
}
2021-07-13 07:04:43 -04:00
)
},
_validUserId(user_id) {
if (user_id == null) {
return false
} else {
return !!user_id.match(/^[a-f0-9]{24}$/)
}
},
TIME_BETWEEN_DISTINCT_UPDATES: (fiveMinutes = 5 * 60 * 1000),
SPLIT_ON_DELETE_SIZE: 16, // characters
_summarizeUpdates(updates, existingSummarizedUpdates) {
if (existingSummarizedUpdates == null) {
existingSummarizedUpdates = []
}
const summarizedUpdates = existingSummarizedUpdates.slice()
let previousUpdateWasBigDelete = false
for (const update of Array.from(updates)) {
var doc_id
const earliestUpdate = summarizedUpdates[summarizedUpdates.length - 1]
let shouldConcat = false
// If a user inserts some text, then deletes a big chunk including that text,
// the update we show might concat the insert and delete, and there will be no sign
// of that insert having happened, or be able to restore to it (restoring after a big delete is common).
// So, we split the summary on 'big' deletes. However, we've stepping backwards in time with
// most recent changes considered first, so if this update is a big delete, we want to start
// a new summarized update next timge, hence we monitor the previous update.
if (previousUpdateWasBigDelete) {
shouldConcat = false
} else if (
earliestUpdate &&
earliestUpdate.meta.end_ts - update.meta.start_ts <
this.TIME_BETWEEN_DISTINCT_UPDATES
) {
// We're going backwards in time through the updates, so only combine if this update starts less than 5 minutes before
// the end of current summarized block, so no block spans more than 5 minutes.
shouldConcat = true
}
let isBigDelete = false
for (const op of Array.from(update.op || [])) {
if (op.d != null && op.d.length > this.SPLIT_ON_DELETE_SIZE) {
isBigDelete = true
}
}
previousUpdateWasBigDelete = isBigDelete
if (shouldConcat) {
// check if the user in this update is already present in the earliest update,
// if not, add them to the users list of the earliest update
earliestUpdate.meta.user_ids = _.union(earliestUpdate.meta.user_ids, [
2021-07-13 07:04:43 -04:00
update.meta.user_id,
])
doc_id = update.doc_id.toString()
const doc = earliestUpdate.docs[doc_id]
if (doc != null) {
doc.fromV = Math.min(doc.fromV, update.v)
doc.toV = Math.max(doc.toV, update.v)
} else {
earliestUpdate.docs[doc_id] = {
fromV: update.v,
2021-07-13 07:04:43 -04:00
toV: update.v,
}
}
earliestUpdate.meta.start_ts = Math.min(
earliestUpdate.meta.start_ts,
update.meta.start_ts
)
earliestUpdate.meta.end_ts = Math.max(
earliestUpdate.meta.end_ts,
update.meta.end_ts
)
} else {
const newUpdate = {
meta: {
user_ids: [],
start_ts: update.meta.start_ts,
2021-07-13 07:04:43 -04:00
end_ts: update.meta.end_ts,
},
2021-07-13 07:04:43 -04:00
docs: {},
}
newUpdate.docs[update.doc_id.toString()] = {
fromV: update.v,
2021-07-13 07:04:43 -04:00
toV: update.v,
}
newUpdate.meta.user_ids.push(update.meta.user_id)
summarizedUpdates.push(newUpdate)
}
}
return summarizedUpdates
2021-07-13 07:04:43 -04:00
},
}
// Promise-returning variants of selected callback-style methods, built with
// util.promisify.
const promisifiedMethods = {
  processUncompressedUpdatesForProject: util.promisify(
    UpdatesManager.processUncompressedUpdatesForProject
  ),
}
module.exports.promises = promisifiedMethods
/**
 * Apply `transform` to `value` unless `value` is null or undefined.
 *
 * @param value - any value
 * @param {Function} transform - called with `value` when it is present
 * @returns the result of `transform(value)`, or undefined when `value` is
 *   null or undefined
 */
function __guard__(value, transform) {
  if (value == null) {
    return undefined
  }
  return transform(value)
}