mirror of
https://github.com/overleaf/overleaf.git
synced 2025-01-01 18:02:30 +00:00
532 lines
18 KiB
CoffeeScript
532 lines
18 KiB
CoffeeScript
async = require "async"
|
|
_ = require "underscore"
|
|
{db, ObjectId, BSON} = require "./mongojs"
|
|
logger = require "logger-sharelatex"
|
|
LockManager = require "./LockManager"
|
|
|
|
# Time unit used below for the one-day age check on temporary packs and for
# the seven-day expiry stamped onto temporary packs.
DAYS = 24 * 3600 * 1000 # one day in milliseconds
|
|
|
|
module.exports = PackManager =
|
|
# The following functions implement methods like a mongo find, but
# expand any documents containing a 'pack' field into multiple
# values
#
# e.g. a single update looks like
#
#   {
#     "doc_id" : 549dae9e0a2a615c0c7f0c98,
#     "project_id" : 549dae9c0a2a615c0c7f0c8c,
#     "op" : [ {"p" : 6981, "d" : "?" } ],
#     "meta" : { "user_id" : 52933..., "start_ts" : 1422310693931, "end_ts" : 1422310693931 },
#     "v" : 17082
#   }
#
# and a pack looks like this
#
#   {
#     "doc_id" : 549dae9e0a2a615c0c7f0c98,
#     "project_id" : 549dae9c0a2a615c0c7f0c8c,
#     "pack" : [ U1, U2, U3, ...., UN],
#     "meta" : { "user_id" : 52933..., "start_ts" : 1422310693931, "end_ts" : 1422310693931 },
#     "v" : 17082
#   }
#
# where U1, U2, U3, .... are single updates stripped of their
# doc_id and project_id fields (which are the same for all the
# updates in the pack).
#
# The pack itself has v and meta fields, this makes it possible to
# treat packs and single updates in the same way.
#
# The v field of the pack itself is from the first entry U1
# The meta.end_ts field of the pack itself is from the last entry UN.
|
|
|
|
findDocResults: (collection, query, limit, callback) ->
  # Run `query` against `collection`, newest version first, and hand back
  # individual updates, expanding every document that carries a 'pack' field.
  #
  # query    - the mongo query selector, includes both the doc_id/project_id
  #            and the range on v
  # limit    - the mongo limit (may be null); it has to be re-applied after
  #            unpacking because a single pack expands into many updates
  # callback - called as callback(err, updates); on success `updates` is
  #            sorted by descending v

  sort = {v: -1}
  cursor = collection
    .find(query)
    .sort(sort)
  # if we have packs, we will trim the results more later after expanding them
  if limit?
    cursor.limit(limit)

  # take the part of the query which selects the range over the parameter
  rangeQuery = query['v']

  # helper function to check if an item from a pack is inside the
  # desired range
  filterFn = (item) ->
    return false if rangeQuery?['$gte']? && item['v'] < rangeQuery['$gte']
    return false if rangeQuery?['$lte']? && item['v'] > rangeQuery['$lte']
    return false if rangeQuery?['$lt']? && item['v'] >= rangeQuery['$lt']
    return false if rangeQuery?['$gt']? && item['v'] <= rangeQuery['$gt']
    return true

  versionOrder = (a, b) ->
    b.v - a.v

  # create a query which can be used to select the entries BEFORE
  # the range because we sometimes need to find extra ones (when the
  # boundary falls in the middle of a pack)
  extraQuery = _.clone(query)
  # The pack uses its first entry for its metadata and v, so the
  # only queries where we might not get all the packs are those for
  # $gt and $gte (i.e. we need to find packs which start before our
  # range but end in it)
  if rangeQuery?['$gte']?
    extraQuery['v'] = {'$lt' : rangeQuery['$gte']}
  else if rangeQuery?['$gt']?
    # existence check (not truthiness) so that a bound of 0 is honoured
    extraQuery['v'] = {'$lte' : rangeQuery['$gt']}
  else
    delete extraQuery['v']

  updates = [] # used to accumulate the set of results

  # FIXME: packs are big so we should accumulate the results
  # incrementally instead of using .toArray() to avoid reading all
  # of the changes into memory
  cursor.toArray (err, result) ->
    # report a failed query before touching `result`, which cannot be
    # relied on in that case
    if err?
      return callback(err, result)

    unpackedSet = PackManager._unpackResults(result)
    updates = PackManager._filterAndLimit(updates, unpackedSet, filterFn, limit)

    # check if we need to retrieve more data, because there is a
    # pack that crosses into our range
    last = if unpackedSet.length then unpackedSet[unpackedSet.length - 1] else null
    needMore = false
    if limit? && updates.length == limit
      needMore = false
    else if extraQuery['v']? && last? && filterFn(last)
      needMore = true
    else if extraQuery['v']? && updates.length == 0
      needMore = true

    unless needMore
      return callback(err, updates.sort(versionOrder))

    # we do need an extra result set: the newest entry before the range
    extra = collection
      .find(extraQuery)
      .sort(sort)
      .limit(1)
    extra.toArray (err, result2) ->
      if err?
        return callback(err, updates.sort(versionOrder))
      extraSet = PackManager._unpackResults(result2)
      updates = PackManager._filterAndLimit(updates, extraSet, filterFn, limit)
      callback(err, updates.sort(versionOrder))
|
|
|
|
findProjectResults: (collection, query, limit, callback) ->
  # Project-wide counterpart of a mongo find ordered by meta.end_ts
  # (newest first), with packs expanded into individual updates.
  #
  # query    - the mongo query selector, includes both the doc_id/project_id
  #            and the range on meta.end_ts
  # limit    - the mongo limit, we need to apply it after unpacking any packs
  # callback - called as callback(err, updates)

  sort = {'meta.end_ts': -1}
  # no need to return the op, only the version info is needed
  projection = {"op": false, "pack.op": false}

  cursor = collection.find(query, projection).sort(sort)
  # if we have packs, we will trim the results more later after expanding them
  cursor.limit(limit) if limit?

  # upper bound of the requested time range, may be null
  before = query['meta.end_ts']?['$lt']

  updates = [] # used to accumulate the set of results

  # FIXME: packs are big so we should accumulate the results
  # incrementally instead of using .toArray() to avoid reading all
  # of the changes into memory
  cursor.toArray (err, result) ->
    # stop on a query error, or when there are no results and no time
    # range was specified
    if err? or (result.length == 0 and not before?)
      return callback(err, result)

    unpackedSet = PackManager._unpackResults(result)
    unpackedSet = unpackedSet.slice(0, limit) if limit?

    # find the end time of the last result, we will take all the
    # results up to this, and then all the changes at that time
    # (without imposing a limit) and any overlapping packs
    oldest = unpackedSet[unpackedSet.length - 1]
    cutoff = if unpackedSet.length then oldest.meta.end_ts else null

    # keep items with cutoff <= end_ts < before (each bound only if set)
    filterFn = (item) ->
      ts = item?.meta?.end_ts
      tooNew = before? && ts >= before
      tooOld = cutoff? && ts < cutoff
      not (tooNew or tooOld)

    documentOrder = (a, b) ->
      x = a.doc_id.valueOf()
      y = b.doc_id.valueOf()
      if x > y then 1 else if x < y then -1 else 0

    # newest first, ties broken by document id
    timeOrder = (a, b) ->
      (b.meta.end_ts - a.meta.end_ts) || documentOrder(a, b)

    updates = PackManager._filterAndLimit(updates, unpackedSet, filterFn, limit)

    # get all elements on the lower bound (cutoff)
    tailQuery = _.clone(query)
    tailQuery['meta.end_ts'] = cutoff
    tail = collection.find(tailQuery, projection).sort(sort)

    # now find any packs that overlap with the time window from outside
    #   cutoff                before
    # --|-----wanted-range--|------------------ time=>
    #                   |-------------|pack(end_ts)
    #
    # these were not picked up by the original query because
    # end_ts>before but the beginning of the pack may be in the time range
    overlapQuery = _.clone(query)
    if before?
      # same selector whether or not a cutoff was found
      overlapQuery['meta.end_ts'] = {"$gte": before}
      overlapQuery['meta.start_ts'] = {"$lte": before }
    else if cutoff?
      overlapQuery['meta.end_ts'] = {"$gte": cutoff} # we already have these??
    else
      overlapQuery['meta.end_ts'] = {"$gte": 0 } # shouldn't happen??

    # we don't specify a limit here, as there could be any number of overlaps
    # NB. need to catch items in original query and followup query for duplicates
    overlap = collection.find(overlapQuery, projection).sort(sort)

    # merge another raw result set into `updates`; the final argument is
    # null (no limit) because we need all the updates at the final time
    # to avoid breaking the changeset into parts
    applyAndUpdate = (rawDocs) ->
      extraSet = PackManager._unpackResults(rawDocs)
      updates = PackManager._filterAndLimit(updates, extraSet, filterFn, null)

    tail.toArray (err, result2) ->
      if err?
        return callback(err, updates.sort(timeOrder))
      applyAndUpdate(result2)
      overlap.toArray (err, result3) ->
        unless err?
          applyAndUpdate(result3)
        callback(err, updates.sort(timeOrder))
|
|
|
|
_unpackResults: (updates) ->
  # Flatten a list of history documents: a document with a 'pack' field is
  # replaced, in position, by the ops PackManager._explodePackToOps returns
  # for it; any other document is kept as is.
  flattened = []
  for entry in updates
    pieces = if entry.pack? then PackManager._explodePackToOps(entry) else [entry]
    flattened = flattened.concat pieces
  return flattened
|
|
|
|
_explodePackToOps: (packObj) ->
  # Convert a pack into an array of ops. Each entry of packObj.pack is
  # stamped (in place) with the pack's doc_id and project_id; the returned
  # array lists the entries in reverse pack order, while packObj.pack itself
  # keeps its order.
  {doc_id, project_id} = packObj
  ops = for entry in packObj.pack
    entry.doc_id = doc_id
    entry.project_id = project_id
    entry
  return ops.reverse()
|
|
|
|
_filterAndLimit: (results, extra, filterFn, limit) ->
  # Update results with extra docs, after filtering and limiting.
  #
  # results  - updates accumulated so far (not modified)
  # extra    - candidate updates; only those accepted by filterFn are added
  # filterFn - predicate (item) -> boolean
  # limit    - maximum number of updates to return, or null for no limit
  #
  # Returns a new array: results followed by the accepted extras, with
  # duplicates (same doc_id and v) removed, truncated to `limit` entries.
  merged = results.concat extra.filter(filterFn)
  # remove duplicates, keeping the first occurrence
  seen = {}
  unique = merged.filter (item) ->
    key = item.doc_id + ' ' + item.v
    return false if seen[key]
    seen[key] = true
    return true
  # slice() returns a copy, so its result must be the one handed back
  return if limit? then unique.slice(0, limit) else unique
|
|
|
|
# Cap on the accumulated BSON.calculateObjectSize total of the updates held
# in one pack (presumably bytes, i.e. 1 MiB — confirm against the BSON library).
MAX_SIZE: 1024*1024 # make these configurable parameters
|
|
# Cap on the number of updates held in one pack.
MAX_COUNT: 512
|
|
|
|
convertDocsToPacks: (docs, callback) ->
  # Group consecutive single-update docs into new pack objects.
  #
  # docs     - history documents, in the order they should be packed
  # callback - called as callback(null, packs)
  #
  # A pack is closed when it reaches MAX_COUNT entries, when adding the next
  # doc would reach MAX_SIZE, or when an existing pack / temporary op is met.
  packs = []
  current = null # pack being filled; null forces the next doc to start a new one
  for doc in docs
    # skip existing packs, and temporary ops (we could pack these into
    # temporary packs in future); both end the current run
    if doc.pack? or doc.expiresAt?
      current = null
      continue

    size = BSON.calculateObjectSize(doc)
    entry = {v: doc.v, meta: doc.meta, op: doc.op, _id: doc._id}
    hasRoom = current? and
      current.pack.length < PackManager.MAX_COUNT and
      current.sz + size < PackManager.MAX_SIZE

    if hasRoom
      current.pack.push entry
      current.sz += size
      current.v_end = doc.v
      current.meta.end_ts = doc.meta.end_ts
    else
      # create a new pack, seeded from a shallow copy of the doc
      current = _.clone(doc)
      current.pack = [entry]
      current.meta = { start_ts: doc.meta.start_ts, end_ts: doc.meta.end_ts }
      current.sz = size
      current.v_end = doc.v
      delete current.op
      delete current._id
      packs.push current

  callback(null, packs)
|
|
|
|
checkHistory: (docs, callback) ->
  # Consistency check over a document's history (ops and packs).
  #
  # docs     - history documents, expected in ascending version order
  # callback - called as callback(errors) with an array of problem
  #            descriptions when anything is wrong, or callback() otherwise
  #
  # Checks that each pack's v_end / meta.start_ts / meta.end_ts agree with
  # its first and last entries, and that versions strictly increase across
  # all ops, including those inside packs.
  errors = []
  error = (args...) ->
    errors.push args

  # Version of the previously visited op. It is declared here so that the
  # nested callbacks share it; assigning it for the first time inside a
  # callback would create a fresh local there and disable the check.
  prev = null
  checkVersion = (v, item) ->
    error('bad version', v, 'in', item) if prev? and v <= prev
    prev = v

  docs.forEach (d) ->
    if d.pack?
      n = d.pack.length
      if n == 0
        # nothing to compare against; report it instead of crashing below
        error('empty pack', d)
        return
      last = d.pack[n - 1]
      error('bad pack v_end', d) if d.v_end != last.v
      error('bad pack start_ts', d) if d.meta.start_ts != d.pack[0].meta.start_ts
      error('bad pack end_ts', d) if d.meta.end_ts != last.meta.end_ts
      d.pack.forEach (p) ->
        checkVersion p.v, p
        #error('expired op', p, 'in pack') if p.expiresAt?
    else
      checkVersion d.v, d

  if errors.length
    callback(errors)
  else
    callback()
|
|
|
|
insertPack: (packObj, callback) ->
  # Write one pack and remove the single ops it replaces, as one ordered
  # bulk operation on db.docHistory.
  #
  # packObj  - the pack to insert; packObj.pack[i]._id are the ops to remove
  # callback - called as callback(err, result) with the bulk result; err is
  #            set if the bulk fails or its insert/remove counts are not the
  #            expected ones
  bulk = db.docHistory.initializeOrderedBulkOp()
  doc_id = packObj.doc_id
  expect_nInserted = 1
  expect_nRemoved = packObj.pack.length
  logger.log {doc_id: doc_id}, "adding pack, removing #{expect_nRemoved} ops"
  bulk.insert packObj
  ids = (op._id for op in packObj.pack)
  bulk.find({_id:{$in:ids}}).remove()
  bulk.execute (err, result) ->
    if err?
      logger.error {doc_id: doc_id, err}, "error adding pack"
      callback(err, result)
    else if result.nInserted != expect_nInserted or result.nRemoved != expect_nRemoved
      logger.error {doc_id: doc_id, result}, "unexpected result adding pack"
      # Error takes a message string; extra details go on as properties
      mismatch = new Error('unexpected result')
      mismatch.expected = {expect_nInserted, expect_nRemoved}
      callback(mismatch, result)
    else
      # record the packing in the per-document stats; a failure here is
      # logged but does not fail the insert, which has already succeeded
      db.docHistoryStats.update {doc_id:doc_id}, {
        $inc:{update_count:-expect_nRemoved},
        $currentDate:{last_packed:true}
      }, {upsert:true}, (statsErr) ->
        if statsErr?
          logger.error {doc_id: doc_id, err: statsErr}, "error updating pack stats"
        callback(err, result)
|
|
|
|
# Retrieve a document's ops/packs in ascending version order and run the
# consistency check on them.
#
# doc_id   - document id, converted with ObjectId() for the query
# callback - called as callback(err) on a query or consistency failure,
#            otherwise callback(err, docs)
getDocHistory: (doc_id, callback) ->
  selector = {doc_id: ObjectId(doc_id)}
  db.docHistory.find(selector).sort {v:1}, (err, docs) ->
    if err?
      return callback(err)
    # for safety, do a consistency check of the history
    logger.log {doc_id}, "checking history for document"
    PackManager.checkHistory docs, (err) ->
      if err?
        return callback(err)
      # PackManager.deleteExpiredPackOps is not invoked from here; it was
      # previously sketched as a follow-up step after the check
      callback(err, docs)
|
|
|
|
packDocHistory: (doc_id, options, callback) ->
  # Pack a document's history while holding its "HistoryLock:<doc_id>" lock.
  #
  # doc_id   - id of the document to pack
  # options  - optional; may be omitted, in which case the second argument
  #            is taken as the callback
  # callback - passed to LockManager.runWithLock as its completion callback
  if typeof callback == "undefined" and typeof options == 'function'
    [options, callback] = [{}, options]

  lockKey = "HistoryLock:#{doc_id}"
  runPacking = (releaseLock) ->
    PackManager._packDocHistory(doc_id, options, releaseLock)

  LockManager.runWithLock(lockKey, runPacking, callback)
|
|
|
|
_packDocHistory: (doc_id, options, callback) ->
  # Load and check a document's history, convert its loose ops into packs,
  # and save them (unless options['dry-run'] is set).
  #
  # callback - called with an error if any step fails; after a successful
  #            save it receives the result of re-reading the history
  logger.log {doc_id},"starting pack operation for document history"

  PackManager.getDocHistory doc_id, (err, docs) ->
    return callback(err) if err?

    origPacks = (d for d in docs when d.pack?).length
    origDocs = docs.length - origPacks

    PackManager.convertDocsToPacks docs, (err, packs) ->
      return callback(err) if err?

      total = 0
      for p in packs
        total += p.pack.length
      logger.log {doc_id, origDocs, origPacks, newPacks: packs.length, totalOps: total}, "document stats"

      unless packs.length
        logger.log {doc_id}, "no packs to write"
        # keep a record that we checked this one to avoid rechecking it
        # NOTE(review): doc_id is used as passed in here, whereas getDocHistory
        # queries with ObjectId(doc_id) — confirm callers pass a matching type
        db.docHistoryStats.update {doc_id:doc_id}, {
          $currentDate:{last_checked:true}
        }, {upsert:true}, () ->
          callback null, null
        return

      if options['dry-run']
        logger.log {doc_id}, 'dry-run, skipping write packs'
        return callback()

      PackManager.savePacks packs, (err) ->
        return callback(err) if err?
        # check the history again
        PackManager.getDocHistory doc_id, callback
|
|
|
|
# Delay handed to setTimeout in safeInsert: how long to wait after each pack
# insert before reporting its result.
DB_WRITE_DELAY: 100
|
|
|
|
savePacks: (packs, callback) ->
  # Insert the packs one after another via PackManager.safeInsert.
  #
  # callback - called with no arguments on success, or as
  #            callback(err, result) after logging when an insert fails
  onFinished = (err, result) ->
    return callback() unless err?
    logger.log {err, result}, "error writing packs"
    callback err, result

  async.eachSeries packs, PackManager.safeInsert, onFinished
|
|
|
|
safeInsert: (packObj, callback) ->
  # Insert a pack, then wait DB_WRITE_DELAY before passing the outcome
  # (error or result) on to the callback.
  PackManager.insertPack packObj, (err, result) ->
    report = ->
      callback(err, result)
    setTimeout report, PackManager.DB_WRITE_DELAY
|
|
|
|
deleteExpiredPackOps: (docs, callback) ->
  # Work out which packs contain expired ops. Packs left with no live ops
  # are listed for removal; packs that lost some ops are trimmed in place
  # (pack, v_end, meta.start_ts, meta.end_ts) and listed for update.
  # The database writes themselves are disabled: the affected packs are
  # only printed with console.log.
  now = Date.now()
  toRemove = []
  toUpdate = []

  # an op without expiresAt never expires
  isLive = (op) ->
    if op.expiresAt? then op.expiresAt > now else true

  for d in docs when d.pack?
    survivors = d.pack.filter (op) -> isLive(op)
    if survivors.length == 0
      toRemove.push d
    else if survivors.length < d.pack.length
      # adjust the pack properties
      d.pack = survivors
      first = survivors[0]
      last = survivors[survivors.length - 1]
      d.v_end = last.v
      d.meta.start_ts = first.meta.start_ts
      d.meta.end_ts = last.meta.end_ts
      toUpdate.push d

  unless toRemove.length or toUpdate.length
    return callback()

  bulk = db.docHistory.initializeOrderedBulkOp()
  for pack in toRemove
    console.log "would remove", pack
    #bulk.find({_id:pack._id}).removeOne()
  for pack in toUpdate
    console.log "would update", pack
    #bulk.find({_id:pack._id}).updateOne(pack);
  # NOTE(review): no operations are queued on `bulk` while the lines above
  # stay commented out — confirm how the driver handles an empty bulk
  bulk.execute callback
|
|
|
|
insertCompressedUpdates: (project_id, doc_id, lastUpdate, newUpdates, temporary, callback = (error) ->) ->
  # Store newUpdates in batches: take as many updates as fit alongside
  # lastUpdate (per its n / sz counters, MAX_COUNT and MAX_SIZE), flush them,
  # then recurse on the remainder with no lastUpdate.
  return callback() if newUpdates.length == 0

  pending = newUpdates.slice()
  batch = []

  count = lastUpdate?.n || 0
  bytes = lastUpdate?.sz || 0

  loop
    hasRoom = pending.length and count < PackManager.MAX_COUNT and bytes < PackManager.MAX_SIZE
    break unless hasRoom
    nextSize = BSON.calculateObjectSize(pending[0])
    # an oversized update is still accepted when it is the first one counted
    break if nextSize + bytes > PackManager.MAX_SIZE and count > 0
    count++
    bytes += nextSize
    batch.push pending.shift()

  PackManager.flushCompressedUpdates project_id, doc_id, lastUpdate, batch, temporary, (error) ->
    if error?
      return callback(error)
    PackManager.insertCompressedUpdates project_id, doc_id, null, pending, temporary, callback
|
|
|
|
flushCompressedUpdates: (project_id, doc_id, lastUpdate, newUpdates, temporary, callback = (error) ->) ->
  # Write newUpdates either onto the end of lastUpdate's pack or into a
  # brand new pack. A new pack is used when there is no lastUpdate, or when
  # the updates are temporary and lastUpdate started more than a day ago.
  return callback() if newUpdates.length == 0

  appendToLast = lastUpdate?
  if appendToLast and temporary
    age = Date.now() - lastUpdate.meta?.start_ts
    appendToLast = not (age > 1 * DAYS)

  if appendToLast
    PackManager.appendUpdatesToExistingPack project_id, doc_id, lastUpdate, newUpdates, temporary, callback
  else
    PackManager.insertUpdatesIntoNewPack project_id, doc_id, newUpdates, temporary, callback
|
|
|
|
insertUpdatesIntoNewPack: (project_id, doc_id, newUpdates, temporary, callback = (error) ->) ->
  # Insert newUpdates into db.docHistory as a single new pack. The pack's
  # v / meta.start_ts come from the first update and v_end / meta.end_ts
  # from the last; temporary packs also get an expiresAt seven days ahead.
  oldest = newUpdates[0]
  newest = newUpdates[newUpdates.length - 1]
  packSize = BSON.calculateObjectSize(newUpdates)

  newPack = {
    project_id: ObjectId(project_id.toString())
    doc_id: ObjectId(doc_id.toString())
    pack: newUpdates
    n: newUpdates.length
    sz: packSize
    meta: {
      start_ts: oldest.meta.start_ts
      end_ts: newest.meta.end_ts
    }
    v: oldest.v
    v_end: newest.v
  }
  if temporary
    newPack.expiresAt = new Date(Date.now() + 7 * DAYS)

  logger.log {project_id, doc_id, newUpdates}, "inserting updates into new pack"
  db.docHistory.insert newPack, callback
|
|
|
|
appendUpdatesToExistingPack: (project_id, doc_id, lastUpdate, newUpdates, temporary, callback = (error) ->) ->
  # Push newUpdates onto the pack identified by lastUpdate._id, bumping its
  # n / sz counters and moving meta.end_ts and v_end to the last new update.
  # The expiry is pushed seven days ahead only when the pack already has an
  # expiresAt and the new updates are temporary as well.
  newest = newUpdates[newUpdates.length - 1]
  addedCount = newUpdates.length
  addedSize = BSON.calculateObjectSize(newUpdates)

  query = {
    _id: lastUpdate._id
    project_id: ObjectId(project_id.toString())
    doc_id: ObjectId(doc_id.toString())
    pack: {$exists: true}
  }
  update = {
    $push: {"pack": {$each: newUpdates}}
    $inc: {"n": addedCount, "sz": addedSize}
    $set: {"meta.end_ts": newest.meta.end_ts, "v_end": newest.v}
  }
  if lastUpdate.expiresAt and temporary
    update.$set.expiresAt = new Date(Date.now() + 7 * DAYS)

  logger.log {project_id, doc_id, lastUpdate, newUpdates}, "appending updates to existing pack"
  db.docHistory.findAndModify {query, update}, callback
|
|
|