async = require "async"
_ = require "underscore"
{db, ObjectId, BSON} = require "./mongojs"
logger = require "logger-sharelatex"
LockManager = require "./LockManager"

DAYS = 24 * 3600 * 1000 # one day in milliseconds

module.exports = PackManager =
  # The following functions implement methods like a mongo find, but
  # expand any documents containing a 'pack' field into multiple
  # values
  #
  # e.g. a single update looks like
  #
  #   {
  #     "doc_id" : 549dae9e0a2a615c0c7f0c98,
  #     "project_id" : 549dae9c0a2a615c0c7f0c8c,
  #     "op" : [ {"p" : 6981, "d" : "?" } ],
  #     "meta" : { "user_id" : 52933..., "start_ts" : 1422310693931, "end_ts" : 1422310693931 },
  #     "v" : 17082
  #   }
  #
  # and a pack looks like this
  #
  #   {
  #     "doc_id" : 549dae9e0a2a615c0c7f0c98,
  #     "project_id" : 549dae9c0a2a615c0c7f0c8c,
  #     "pack" : [ U1, U2, U3, ...., UN],
  #     "meta" : { "user_id" : 52933..., "start_ts" : 1422310693931, "end_ts" : 1422310693931 },
  #     "v" : 17082
  #   }
  #
  # where U1, U2, U3, .... are single updates stripped of their
  # doc_id and project_id fields (which are the same for all the
  # updates in the pack).
  #
  # The pack itself has v and meta fields, which makes it possible to
  # treat packs and single updates in the same way.
  #
  # The v field of the pack itself is from the first entry U1.
  # The meta.end_ts field of the pack itself is from the last entry UN.

  findDocResults: (collection, query, limit, callback) ->
    # query - the mongo query selector, includes both the doc_id/project_id and
    #         the range on v
    # limit - the mongo limit, we need to apply it after unpacking any packs
    sort = {}
    sort['v'] = -1
    cursor = collection
      .find( query )
      .sort( sort )
    # if we have packs, we will trim the results further after expanding them
    if limit?
      cursor.limit(limit)

    # take the part of the query which selects the range over the parameter
    rangeQuery = query['v']

    # helper function to check if an item from a pack is inside the
    # desired range
    filterFn = (item) ->
      return false if rangeQuery?['$gte']? && item['v'] < rangeQuery['$gte']
      return false if rangeQuery?['$lte']? && item['v'] > rangeQuery['$lte']
      return false if rangeQuery?['$lt']? && item['v'] >= rangeQuery['$lt']
      return false if rangeQuery?['$gt']? && item['v'] <= rangeQuery['$gt']
      return true

    versionOrder = (a, b) ->
      b.v - a.v

    # create a query which can be used to select the entries BEFORE
    # the range, because we sometimes need to find extra ones (when the
    # boundary falls in the middle of a pack)
    extraQuery = _.clone(query)
    # The pack uses its first entry for its metadata and v, so the
    # only queries where we might not get all the packs are those for
    # $gt and $gte (i.e. we need to find packs which start before our
    # range but end inside it)
    if rangeQuery?['$gte']?
      extraQuery['v'] = {'$lt': rangeQuery['$gte']}
    else if rangeQuery?['$gt']
      extraQuery['v'] = {'$lte': rangeQuery['$gt']}
    else
      delete extraQuery['v']

    needMore = false # keep track of whether we need to load more data
    updates = [] # used to accumulate the set of results

    # FIXME: packs are big, so we should accumulate the results
    # incrementally instead of using .toArray(), to avoid reading all
    # of the changes into memory
    cursor.toArray (err, result) ->
      unpackedSet = PackManager._unpackResults(result)
      updates = PackManager._filterAndLimit(updates, unpackedSet, filterFn, limit)
      # check if we need to retrieve more data, because there is a
      # pack that crosses into our range
      last = if unpackedSet.length then unpackedSet[unpackedSet.length - 1] else null
      if limit? && updates.length == limit
        needMore = false
      else if extraQuery['v']? && last? && filterFn(last)
        needMore = true
      else if extraQuery['v']? && updates.length == 0
        needMore = true
      if needMore
        # we do need an extra result set
        extra = collection
          .find(extraQuery)
          .sort(sort)
          .limit(1)
        extra.toArray (err, result2) ->
          if err?
            return callback err, updates.sort versionOrder
          else
            extraSet = PackManager._unpackResults(result2)
            updates = PackManager._filterAndLimit(updates, extraSet, filterFn, limit)
            callback err, updates.sort versionOrder
        return
      if err?
        callback err, result
      else
        callback err, updates.sort versionOrder
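  # Illustrative call (a sketch, not part of the module): the collection handle
  # comes from ./mongojs above, but the doc id and version range are
  # hypothetical values.
  #
  #   PackManager.findDocResults db.docHistory,
  #     {doc_id: ObjectId(some_doc_id), v: {$gte: 100, $lte: 200}}, 50, (err, updates) ->
  #       # `updates` holds single updates in descending v order,
  #       # with any packs already expanded and the limit applied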
  findProjectResults: (collection, query, limit, callback) ->
    # query - the mongo query selector, includes both the doc_id/project_id and
    #         the range on meta.end_ts
    # limit - the mongo limit, we need to apply it after unpacking any packs
    sort = {}
    sort['meta.end_ts'] = -1
    projection = {"op": false, "pack.op": false}
    # no need to return the op, we only need the version info
    cursor = collection
      .find( query, projection )
      .sort( sort )
    # if we have packs, we will trim the results more later after expanding them
    if limit?
      cursor.limit(limit)

    # take the part of the query which selects the range over the parameter
    before = query['meta.end_ts']?['$lt'] # may be null

    updates = [] # used to accumulate the set of results

    # FIXME: packs are big so we should accumulate the results
    # incrementally instead of using .toArray() to avoid reading all
    # of the changes into memory
    cursor.toArray (err, result) ->
      if err?
        return callback err, result
      if result.length == 0 && not before? # no results and no time range specified
        return callback err, result

      unpackedSet = PackManager._unpackResults(result)
      if limit?
        unpackedSet = unpackedSet.slice(0, limit)
      # find the end time of the last result, we will take all the
      # results up to this, and then all the changes at that time
      # (without imposing a limit) and any overlapping packs
      cutoff = if unpackedSet.length then unpackedSet[unpackedSet.length - 1].meta.end_ts else null

      filterFn = (item) ->
        ts = item?.meta?.end_ts
        return false if before? && ts >= before
        return false if cutoff? && ts < cutoff
        return true

      timeOrder = (a, b) ->
        (b.meta.end_ts - a.meta.end_ts) || documentOrder(a, b)

      documentOrder = (a, b) ->
        x = a.doc_id.valueOf()
        y = b.doc_id.valueOf()
        if x > y then 1 else if x < y then -1 else 0

      updates = PackManager._filterAndLimit(updates, unpackedSet, filterFn, limit)

      # get all elements on the lower bound (cutoff)
      tailQuery = _.clone(query)
      tailQuery['meta.end_ts'] = cutoff
      tail = collection
        .find(tailQuery, projection)
        .sort(sort)

      # now find any packs that overlap with the time window from outside
      #
      #          cutoff               before
      #   ---------|-----wanted-range----|------------------ time=>
      #                        |--------------|pack(end_ts)
      #
      # these were not picked up by the original query because
      # end_ts > before, but the beginning of the pack may be in the time range
      overlapQuery = _.clone(query)
      if before? && cutoff?
        overlapQuery['meta.end_ts'] = {"$gte": before}
        overlapQuery['meta.start_ts'] = {"$lte": before}
      else if before? && not cutoff?
        overlapQuery['meta.end_ts'] = {"$gte": before}
        overlapQuery['meta.start_ts'] = {"$lte": before}
      else if not before? && cutoff?
        overlapQuery['meta.end_ts'] = {"$gte": cutoff} # we already have these??
      else if not before? && not cutoff?
        overlapQuery['meta.end_ts'] = {"$gte": 0} # shouldn't happen??
      overlap = collection
        .find(overlapQuery, projection)
        .sort(sort)
      # we don't specify a limit here, as there could be any number of overlaps
      # NB. need to catch items in original query and follow-up query for duplicates

      applyAndUpdate = (result) ->
        extraSet = PackManager._unpackResults(result)
        # note: final argument is null, no limit applied, because we
        # need all the updates at the final time to avoid breaking
        # the changeset into parts
        updates = PackManager._filterAndLimit(updates, extraSet, filterFn, null)

      tail.toArray (err, result2) ->
        if err?
          return callback err, updates.sort timeOrder
        else
          applyAndUpdate result2
          overlap.toArray (err, result3) ->
            if err?
              return callback err, updates.sort timeOrder
            else
              applyAndUpdate result3
              callback err, updates.sort timeOrder
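  # Illustrative call (a sketch, not part of the module): `project_id` and the
  # cutoff timestamp are hypothetical, but the query shape matches what
  # findProjectResults expects (an optional $lt bound on meta.end_ts).
  #
  #   before = Date.now() - 7 * DAYS
  #   PackManager.findProjectResults db.docHistory,
  #     {project_id: ObjectId(project_id), "meta.end_ts": {$lt: before}}, 100, (err, updates) ->
  #       # `updates` is sorted by meta.end_ts (newest first), ties broken by doc_id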
  _unpackResults: (updates) ->
    # iterate over the updates; if there's a pack, expand it into ops and
    # insert them into the array at that point
    result = []
    updates.forEach (item) ->
      if item.pack?
        all = PackManager._explodePackToOps item
        result = result.concat all
      else
        result.push item
    return result

  _explodePackToOps: (packObj) ->
    # convert a pack into an array of ops
    doc_id = packObj.doc_id
    project_id = packObj.project_id
    result = packObj.pack.map (item) ->
      item.doc_id = doc_id
      item.project_id = project_id
      item
    return result.reverse()

  _filterAndLimit: (results, extra, filterFn, limit) ->
    # update results with extra docs, after filtering and limiting
    filtered = extra.filter(filterFn)
    newResults = results.concat filtered
    # remove duplicates
    seen = {}
    newResults = newResults.filter (item) ->
      key = item.doc_id + ' ' + item.v
      if seen[key]
        return false
      else
        seen[key] = true
        return true
    newResults = newResults.slice(0, limit) if limit?
    return newResults

  MAX_SIZE:  1024 * 1024 # make these configurable parameters
  MAX_COUNT: 512

  convertDocsToPacks: (docs, callback) ->
    packs = []
    top = null
    docs.forEach (d, i) ->
      # skip existing packs
      if d.pack?
        top = null
        return
      # skip temporary ops (we could pack these into temporary packs in future)
      if d.expiresAt?
        top = null
        return
      sz = BSON.calculateObjectSize(d)
      if top? && top.pack.length < PackManager.MAX_COUNT && top.sz + sz < PackManager.MAX_SIZE
        top.pack = top.pack.concat {v: d.v, meta: d.meta, op: d.op, _id: d._id}
        top.sz += sz
        top.v_end = d.v
        top.meta.end_ts = d.meta.end_ts
        return
      else
        # create a new pack
        top = _.clone(d)
        top.pack = [ {v: d.v, meta: d.meta, op: d.op, _id: d._id} ]
        top.meta = { start_ts: d.meta.start_ts, end_ts: d.meta.end_ts }
        top.sz = sz
        top.v_end = d.v
        delete top.op
        delete top._id
        packs.push top
    callback(null, packs)

  checkHistory: (docs, callback) ->
    errors = []
    prev = null
    error = (args...) ->
      errors.push args
    docs.forEach (d, i) ->
      if d.pack?
        n = d.pack.length
        last = d.pack[n - 1]
        error('bad pack v_end', d) if d.v_end != last.v
        error('bad pack start_ts', d) if d.meta.start_ts != d.pack[0].meta.start_ts
        error('bad pack end_ts', d) if d.meta.end_ts != last.meta.end_ts
        d.pack.forEach (p, i) ->
          prev = v
          v = p.v
          error('bad version', v, 'in', p) if v <= prev
          #error('expired op', p, 'in pack') if p.expiresAt?
      else
        prev = v
        v = d.v
        error('bad version', v, 'in', d) if v <= prev
    if errors.length
      callback(errors)
    else
      callback()

  insertPack: (packObj, callback) ->
    bulk = db.docHistory.initializeOrderedBulkOp()
    doc_id = packObj.doc_id
    expect_nInserted = 1
    expect_nRemoved = packObj.pack.length
    logger.log {doc_id: doc_id}, "adding pack, removing #{expect_nRemoved} ops"
    bulk.insert packObj
    ids = (op._id for op in packObj.pack)
    bulk.find({_id: {$in: ids}}).remove()
    bulk.execute (err, result) ->
      if err?
        logger.error {doc_id: doc_id}, "error adding pack"
        callback(err, result)
      else if result.nInserted != expect_nInserted or result.nRemoved != expect_nRemoved
        logger.error {doc_id: doc_id, result}, "unexpected result adding pack"
        callback(new Error(
          msg: 'unexpected result'
          expected: {expect_nInserted, expect_nRemoved}
        ), result)
      else
        db.docHistoryStats.update {doc_id: doc_id}, {
          $inc: {update_count: -expect_nRemoved},
          $currentDate: {last_packed: true}
        }, {upsert: true}, () ->
          callback(err, result)
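  # For reference, a sketch of the pack document that convertDocsToPacks builds
  # and insertPack writes (field values are illustrative, not exhaustive):
  #
  #   {
  #     doc_id: ..., project_id: ...,
  #     pack: [ {v, meta, op, _id}, ... ],        # the original ops, oldest first
  #     sz: <approximate BSON size of the packed ops>,
  #     v: <v of the first op>, v_end: <v of the last op>,
  #     meta: { start_ts: <first op start_ts>, end_ts: <last op end_ts> }
  #   }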
  # retrieve document ops/packs and check them
  getDocHistory: (doc_id, callback) ->
    db.docHistory.find({doc_id: ObjectId(doc_id)}).sort {v: 1}, (err, docs) ->
      return callback(err) if err?
      # for safety, do a consistency check of the history
      logger.log {doc_id}, "checking history for document"
      PackManager.checkHistory docs, (err) ->
        return callback(err) if err?
        callback(err, docs)
        #PackManager.deleteExpiredPackOps docs, (err) ->
        #  return callback(err) if err?
        #  callback err, docs

  packDocHistory: (doc_id, options, callback) ->
    if typeof callback == "undefined" and typeof options == 'function'
      callback = options
      options = {}
    LockManager.runWithLock(
      "HistoryLock:#{doc_id}",
      (releaseLock) ->
        PackManager._packDocHistory(doc_id, options, releaseLock)
      , callback
    )

  _packDocHistory: (doc_id, options, callback) ->
    logger.log {doc_id}, "starting pack operation for document history"
    PackManager.getDocHistory doc_id, (err, docs) ->
      return callback(err) if err?
      origDocs = 0
      origPacks = 0
      for d in docs
        if d.pack? then origPacks++ else origDocs++
      PackManager.convertDocsToPacks docs, (err, packs) ->
        return callback(err) if err?
        total = 0
        for p in packs
          total = total + p.pack.length
        logger.log {doc_id, origDocs, origPacks, newPacks: packs.length, totalOps: total}, "document stats"
        if packs.length
          if options['dry-run']
            logger.log {doc_id}, 'dry-run, skipping write packs'
            return callback()
          PackManager.savePacks packs, (err) ->
            return callback(err) if err?
            # check the history again
            PackManager.getDocHistory doc_id, callback
        else
          logger.log {doc_id}, "no packs to write"
          # keep a record that we checked this one to avoid rechecking it
          db.docHistoryStats.update {doc_id: doc_id}, {
            $currentDate: {last_checked: true}
          }, {upsert: true}, () ->
            callback null, null

  DB_WRITE_DELAY: 100

  savePacks: (packs, callback) ->
    async.eachSeries packs, PackManager.safeInsert, (err, result) ->
      if err?
        logger.log {err, result}, "error writing packs"
        callback err, result
      else
        callback()

  safeInsert: (packObj, callback) ->
    PackManager.insertPack packObj, (err, result) ->
      setTimeout () ->
        callback(err, result)
      , PackManager.DB_WRITE_DELAY

  deleteExpiredPackOps: (docs, callback) ->
    now = Date.now()
    toRemove = []
    toUpdate = []
    docs.forEach (d, i) ->
      if d.pack?
        newPack = d.pack.filter (op) ->
          if op.expiresAt? then op.expiresAt > now else true
        if newPack.length == 0
          toRemove.push d
        else if newPack.length < d.pack.length
          # adjust the pack properties
          d.pack = newPack
          first = d.pack[0]
          last = d.pack[d.pack.length - 1]
          d.v_end = last.v
          d.meta.start_ts = first.meta.start_ts
          d.meta.end_ts = last.meta.end_ts
          toUpdate.push d
    if toRemove.length or toUpdate.length
      bulk = db.docHistory.initializeOrderedBulkOp()
      toRemove.forEach (pack) ->
        console.log "would remove", pack
        #bulk.find({_id: pack._id}).removeOne()
      toUpdate.forEach (pack) ->
        console.log "would update", pack
        #bulk.find({_id: pack._id}).updateOne(pack)
      bulk.execute callback
    else
      callback()
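  # Note: deleteExpiredPackOps currently only logs what it would remove or
  # update (the bulk removeOne/updateOne calls are commented out), and the
  # call to it from getDocHistory is also commented out above.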
  insertCompressedUpdates: (project_id, doc_id, lastUpdate, newUpdates, temporary, callback = (error) ->) ->
    return callback() if newUpdates.length == 0

    updatesToFlush = []
    updatesRemaining = newUpdates.slice()

    n = lastUpdate?.n || 0
    sz = lastUpdate?.sz || 0

    while updatesRemaining.length and n < PackManager.MAX_COUNT and sz < PackManager.MAX_SIZE
      nextUpdate = updatesRemaining[0]
      nextUpdateSize = BSON.calculateObjectSize(nextUpdate)
      if nextUpdateSize + sz > PackManager.MAX_SIZE and n > 0
        break
      n++
      sz += nextUpdateSize
      updatesToFlush.push updatesRemaining.shift()

    PackManager.flushCompressedUpdates project_id, doc_id, lastUpdate, updatesToFlush, temporary, (error) ->
      return callback(error) if error?
      PackManager.insertCompressedUpdates project_id, doc_id, null, updatesRemaining, temporary, callback

  flushCompressedUpdates: (project_id, doc_id, lastUpdate, newUpdates, temporary, callback = (error) ->) ->
    return callback() if newUpdates.length == 0
    if lastUpdate? and not (temporary and ((Date.now() - lastUpdate.meta?.start_ts) > 1 * DAYS))
      PackManager.appendUpdatesToExistingPack project_id, doc_id, lastUpdate, newUpdates, temporary, callback
    else
      PackManager.insertUpdatesIntoNewPack project_id, doc_id, newUpdates, temporary, callback

  insertUpdatesIntoNewPack: (project_id, doc_id, newUpdates, temporary, callback = (error) ->) ->
    first = newUpdates[0]
    last = newUpdates[newUpdates.length - 1]
    n = newUpdates.length
    sz = BSON.calculateObjectSize(newUpdates)
    newPack =
      project_id: ObjectId(project_id.toString())
      doc_id: ObjectId(doc_id.toString())
      pack: newUpdates
      n: n
      sz: sz
      meta:
        start_ts: first.meta.start_ts
        end_ts: last.meta.end_ts
      v: first.v
      v_end: last.v
    if temporary
      newPack.expiresAt = new Date(Date.now() + 7 * DAYS)
    logger.log {project_id, doc_id, newUpdates}, "inserting updates into new pack"
    db.docHistory.insert newPack, callback

  appendUpdatesToExistingPack: (project_id, doc_id, lastUpdate, newUpdates, temporary, callback = (error) ->) ->
    first = newUpdates[0]
    last = newUpdates[newUpdates.length - 1]
    n = newUpdates.length
    sz = BSON.calculateObjectSize(newUpdates)
    query =
      _id: lastUpdate._id
      project_id: ObjectId(project_id.toString())
      doc_id: ObjectId(doc_id.toString())
      pack: {$exists: true}
    update =
      $push:
        "pack": {$each: newUpdates}
      $inc:
        "n": n
        "sz": sz
      $set:
        "meta.end_ts": last.meta.end_ts
        "v_end": last.v
    if lastUpdate.expiresAt and temporary
      update.$set.expiresAt = new Date(Date.now() + 7 * DAYS)
    logger.log {project_id, doc_id, lastUpdate, newUpdates}, "appending updates to existing pack"
    db.docHistory.findAndModify {query, update}, callback
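# Illustrative usage (a sketch, not part of the module): packing the history of
# a single document, using the supported 'dry-run' option to log the document
# stats without writing any packs.
#
#   PackManager.packDocHistory doc_id, {'dry-run': true}, (err) ->
#     throw err if err?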