overleaf/services/track-changes/app/coffee/MongoPackManager.coffee

async = require "async"
_ = require "underscore"

module.exports = MongoPackManager =
	# The following functions implement methods like a mongo find, but
	# expand any documents containing a 'pack' field into multiple
	# values
	#
	#  e.g.  a single update looks like
	#
	#   {
	#     "doc_id" : 549dae9e0a2a615c0c7f0c98,
	#     "project_id" : 549dae9c0a2a615c0c7f0c8c,
	#     "op" : [ {"p" : 6981,	"d" : "?"	} ],
	#     "meta" : {	"user_id" : 52933..., "start_ts" : 1422310693931,	"end_ts" : 1422310693931 },
	#     "v" : 17082
	#   }
	#
	#  and a pack looks like this
	#
	#   {
	#     "doc_id" : 549dae9e0a2a615c0c7f0c98,
	#     "project_id" : 549dae9c0a2a615c0c7f0c8c,
	#     "pack" : [ D1, D2, D3, ....],
	#     "meta" : {	"user_id" : 52933..., "start_ts" : 1422310693931,	"end_ts" : 1422310693931 },
	#     "v" : 17082
	#   }
	#
	#  where D1, D2, D3, .... are single updates stripped of their
	#  doc_id and project_id fields (which are the same for all the
	#  updates in the pack).  The meta and v fields of the pack itself
	#  are those of the first entry in the pack D1 (this makes it
	#  possible to treat packs and single updates in the same way).


	findDocResults: (collection, query, limit, callback) ->
		# Find the updates for a doc, expanding any packs into individual
		# updates, and return them sorted by descending version.
		#
		# collection - object providing find(); the cursors it returns are
		# used via sort(), limit() and toArray()
		# query - the mongo query selector, includes both the doc_id/project_id and
		# the range on v
		# limit - the mongo limit (may be null), we need to apply it after
		# unpacking any packs
		# callback - called as callback(err, updates)

		sort = {}
		sort['v'] = -1
		cursor = collection
			.find( query )
			.sort( sort )
		# if we have packs, we will trim the results more later after expanding them
		if limit?
			cursor.limit(limit)

		# take the part of the query which selects the range over the parameter
		rangeQuery = query['v']

		# helper function to check if an item from a pack is inside the
		# desired range
		filterFn = (item) ->
			return false if rangeQuery?['$gte']? && item['v'] < rangeQuery['$gte']
			return false if rangeQuery?['$lte']? && item['v'] > rangeQuery['$lte']
			return false if rangeQuery?['$lt']? && item['v'] >= rangeQuery['$lt']
			return false if rangeQuery?['$gt']? && item['v'] <= rangeQuery['$gt']
			return true

		# comparator: highest version first
		versionOrder = (a, b) ->
			b.v - a.v

		# create a query which can be used to select the entries BEFORE
		# the range because we sometimes need to find extra ones (when the
		# boundary falls in the middle of a pack)
		extraQuery = _.clone(query)
		# The pack uses its first entry for its metadata and v, so the
		# only queries where we might not get all the packs are those for
		# $gt and $gte (i.e. we need to find packs which start before our
		# range but end in it)
		if rangeQuery?['$gte']?
			extraQuery['v'] = {'$lt' : rangeQuery['$gte']}
		else if rangeQuery?['$gt']?
			# existence check (not truthiness) so that a bound of 0 still
			# produces the extra query
			extraQuery['v'] = {'$lte' : rangeQuery['$gt']}
		else
			delete extraQuery['v']

		updates = [] # used to accumulate the set of results
		cursor.toArray (err, result) ->
			# bail out before touching result: it cannot be unpacked when
			# the query itself failed
			if err?
				return callback err, result

			unpackedSet = MongoPackManager._unpackResults(result)
			updates = MongoPackManager._filterAndLimit(updates, unpackedSet, filterFn, limit)
			# check if we need to retrieve more data, because there is a
			# pack that crosses into our range
			last = if unpackedSet.length then unpackedSet[unpackedSet.length-1] else null
			needMore = false  # keep track of whether we need to load more data
			if limit? && updates.length >= limit
				# we already hold enough updates, anything older is not needed
				needMore = false
			else if extraQuery['v']? && last? && filterFn(last)
				needMore = true
			else if extraQuery['v']? && updates.length == 0
				needMore = true

			if not needMore
				return callback err, updates.sort versionOrder

			# we do need an extra result set: the single newest entry
			# that starts before the requested range
			extra = collection
				.find(extraQuery)
				.sort(sort)
				.limit(1)
			extra.toArray (err, result2) ->
				if err?
					# report the error together with what was gathered so far
					return callback err, updates.sort versionOrder
				extraSet = MongoPackManager._unpackResults(result2)
				updates = MongoPackManager._filterAndLimit(updates, extraSet, filterFn, limit)
				callback err, updates.sort versionOrder

	findProjectResults: (collection, query, limit, callback) ->
		# Find the updates for a project, expanding packs into individual
		# updates, ordered by descending meta.end_ts (ties broken by doc_id).
		#
		# collection - object providing find(); the cursors it returns are
		# used via sort(), limit() and toArray()
		# query - the mongo query selector; an optional '$lt' bound on
		# 'meta.end_ts' is read from it
		# limit - the mongo limit (may be null), applied again after packs
		# have been expanded
		# callback - called as callback(err, updates)

		endTs = 'meta.end_ts'
		newestFirst = {}
		newestFirst[endTs] = -1

		# the ops are not needed here, only the version info
		fields = {"op": false, "pack.op": false}

		mainCursor = collection.find(query, fields).sort(newestFirst)
		# packs are trimmed further below, once they have been expanded
		mainCursor.limit(limit) if limit?

		# upper time bound requested by the caller (may be null)
		before = query[endTs]?['$lt']

		collected = [] # accumulates the set of results

		mainCursor.toArray (err, docs) ->
			return callback(err, docs) if err?
			# no results and no time range specified
			return callback(err, docs) if docs.length == 0 and not before?

			expanded = MongoPackManager._unpackResults(docs)
			expanded = expanded.slice(0, limit) if limit?

			# end time of the last entry kept: everything down to this time
			# is taken, plus all changes at exactly this time (without a
			# limit) and any overlapping packs
			cutoff = null
			cutoff = expanded[expanded.length - 1].meta.end_ts if expanded.length

			# true when the item lies inside [cutoff, before)
			inWindow = (item) ->
				ts = item?.meta?.end_ts
				tooNew = before? and ts >= before
				tooOld = cutoff? and ts < cutoff
				not (tooNew or tooOld)

			# comparator on doc_id, used to break timestamp ties
			byDocId = (a, b) ->
				left = a.doc_id.valueOf()
				right = b.doc_id.valueOf()
				return 1 if left > right
				return -1 if left < right
				0

			# comparator: latest end_ts first, then by doc_id
			newestThenDoc = (a, b) ->
				(b.meta.end_ts - a.meta.end_ts) || byDocId(a, b)

			collected = MongoPackManager._filterAndLimit(collected, expanded, inWindow, limit)

			# all elements sitting exactly on the lower bound (cutoff)
			tailQuery = _.clone(query)
			tailQuery[endTs] = cutoff
			tailCursor = collection.find(tailQuery, fields).sort(newestFirst)

			# any packs that overlap with the time window
			overlapQuery = _.clone(query)
			if before?
				overlapQuery[endTs] = {"$gte": before}
				overlapQuery['pack.0.meta.end_ts'] = {"$lte": before}
			else
				lowerBound = if cutoff? then cutoff else 0
				overlapQuery[endTs] = {"$gte": lowerBound}
				overlapQuery['pack.0.meta.end_ts'] = {"$gte": 0}
			# no limit here, as there could be any number of overlaps
			overlapCursor = collection.find(overlapQuery, fields).sort(newestFirst)

			# merge a follow-up result set into the accumulated updates; no
			# limit is passed so that all the updates at the final time are
			# kept and the changeset is not broken into parts
			mergeIn = (found) ->
				unpacked = MongoPackManager._unpackResults(found)
				collected = MongoPackManager._filterAndLimit(collected, unpacked, inWindow, null)

			# hand back whatever has been gathered, in time order
			finish = (error) ->
				callback error, collected.sort(newestThenDoc)

			tailCursor.toArray (err, tailDocs) ->
				return finish(err) if err?
				mergeIn tailDocs
				overlapCursor.toArray (err, overlapDocs) ->
					return finish(err) if err?
					mergeIn overlapDocs
					finish err

	_unpackResults: (updates) ->
		# Flatten a list of mongo documents: a document with a 'pack' field
		# is replaced, in place, by the ops that _explodePackToOps returns
		# for it; any other document is kept as is. Returns a new array.
		appendEntry = (flat, entry) ->
			if entry.pack?
				return flat.concat MongoPackManager._explodePackToOps(entry)
			flat.push entry
			flat
		return updates.reduce appendEntry, []

	_explodePackToOps: (packObj) ->
		# Turn a pack document into an array of its entries, in reverse of
		# their order inside the pack. Each entry is given the doc_id and
		# project_id of the pack (the entries themselves are modified).
		{doc_id, project_id} = packObj
		attachIds = (entry) ->
			entry.doc_id = doc_id
			entry.project_id = project_id
			return entry
		return packObj.pack.map(attachIds).reverse()

	_filterAndLimit: (results, extra, filterFn, limit) ->
		# Merge extra docs into results, after filtering and limiting.
		#
		# results - updates accumulated so far (not modified)
		# extra - candidate updates to add
		# filterFn - predicate, only the entries of extra that pass are added
		# limit - maximum number of updates to return, null for no limit
		#
		# Returns a new array: results followed by the accepted extras, with
		# duplicates (same doc_id and v) removed, cut down to limit entries.
		filtered = extra.filter(filterFn)
		newResults = results.concat filtered
		# remove duplicates, keeping the first occurrence of each doc_id/v pair
		seen = {}
		newResults = newResults.filter (item) ->
			key = item.doc_id + ' ' + item.v
			return false if seen[key]
			seen[key] = true
			return true
		# slice returns a new array, so the result has to be assigned for
		# the limit to take effect
		newResults = newResults.slice(0, limit) if limit?
		return newResults
moved pack code into MongoPackManager 2015-02-13 11:18:15 -05:00			`async = require "async"`
			`_ = require "underscore"`

			`module.exports = MongoPackManager =`
			`# The following functions implement methods like a mongo find, but`
			`# expands any documents containing a 'pack' field into multiple`
			`# values`
			`#`
			`# e.g. a single update looks like`
			`#`
			`# {`
			`# "doc_id" : 549dae9e0a2a615c0c7f0c98,`
			`# "project_id" : 549dae9c0a2a615c0c7f0c8c,`
			`# "op" : [ {"p" : 6981, "d" : "?" } ],`
			`# "meta" : { "user_id" : 52933..., "start_ts" : 1422310693931, "end_ts" : 1422310693931 },`
			`# "v" : 17082`
			`# }`
			`#`
			`# and a pack looks like this`
			`#`
			`# {`
			`# "doc_id" : 549dae9e0a2a615c0c7f0c98,`
			`# "project_id" : 549dae9c0a2a615c0c7f0c8c,`
			`# "pack" : [ D1, D2, D3, ....],`
			`# "meta" : { "user_id" : 52933..., "start_ts" : 1422310693931, "end_ts" : 1422310693931 },`
			`# "v" : 17082`
			`# }`
			`#`
			`# where D1, D2, D3, .... are single updates stripped of their`
			`# doc_id and project_id fields (which are the same for all the`
			`# updates in the pack). The meta and v fields of the pack itself`
			`# are those of the first entry in the pack D1 (this makes it`
			`# possible to treat packs and single updates in the same way).`


			`findDocResults: (collection, query, limit, callback) ->`
			`# query - the mongo query selector, includes both the doc_id/project_id and`
			`# the range on v`
			`# limit - the mongo limit, we need to apply it after unpacking any`
			`# packs`

			`sort = {}`
			`sort['v'] = -1;`
			`cursor = collection`
			`.find( query )`
			`.sort( sort )`
			`# if we have packs, we will trim the results more later after expanding them`
			`if limit?`
			`cursor.limit(limit)`

			`# take the part of the query which selects the range over the parameter`
			`rangeQuery = query['v']`

			`# helper function to check if an item from a pack is inside the`
			`# desired range`
			`filterFn = (item) ->`
			`return false if rangeQuery?['$gte']? && item['v'] < rangeQuery['$gte']`
			`return false if rangeQuery?['$lte']? && item['v'] > rangeQuery['$lte']`
			`return false if rangeQuery?['$lt']? && item['v'] >= rangeQuery['$lt']`
			`return false if rangeQuery?['$gt']? && item['v'] <= rangeQuery['$gt']`
			`return true`

ensure document results are in version order 2015-02-16 08:43:27 -05:00			`versionOrder = (a, b) ->`
			`b.v - a.v`

moved pack code into MongoPackManager 2015-02-13 11:18:15 -05:00			`# create a query which can be used to select the entries BEFORE`
			`# the range because we sometimes need to find extra ones (when the`
			`# boundary falls in the middle of a pack)`
			`extraQuery = _.clone(query)`
			`# The pack uses its first entry for its metadata and v, so the`
			`# only queries where we might not get all the packs are those for`
			`# $gt and $gte (i.e. we need to find packs which start before our`
			`# range but end in it)`
			`if rangeQuery?['$gte']?`
			`extraQuery['v'] = {'$lt' : rangeQuery['$gte']}`
			`else if rangeQuery?['$gt']`
			`extraQuery['v'] = {'$lte' : rangeQuery['$gt']}`
			`else`
			`delete extraQuery['v']`

			`needMore = false # keep track of whether we need to load more data`
			`updates = [] # used to accumulate the set of results`
			`cursor.toArray (err, result) ->`
			`unpackedSet = MongoPackManager._unpackResults(result)`
			`updates = MongoPackManager._filterAndLimit(updates, unpackedSet, filterFn, limit)`
			`# check if we need to retrieve more data, because there is a`
			`# pack that crosses into our range`
			`last = if unpackedSet.length then unpackedSet[unpackedSet.length-1] else null`
			`if limit? && updates.length == limit`
			`needMore = false`
			`else if extraQuery['v']? && last? && filterFn(last)`
			`needMore = true`
			`else if extraQuery['v']? && updates.length == 0`
			`needMore = true`
			`if needMore`
			`# we do need an extra result set`
			`extra = collection`
			`.find(extraQuery)`
			`.sort(sort)`
			`.limit(1)`
			`extra.toArray (err, result2) ->`
			`if err?`
ensure document results are in version order 2015-02-16 08:43:27 -05:00			`return callback err, updates.sort versionOrder`
moved pack code into MongoPackManager 2015-02-13 11:18:15 -05:00			`else`
			`extraSet = MongoPackManager._unpackResults(result2)`
			`updates = MongoPackManager._filterAndLimit(updates, extraSet, filterFn, limit)`
ensure document results are in version order 2015-02-16 08:43:27 -05:00			`callback err, updates.sort versionOrder`
moved pack code into MongoPackManager 2015-02-13 11:18:15 -05:00			`return`
			`if err?`
			`callback err, result`
			`else`
ensure document results are in version order 2015-02-16 08:43:27 -05:00			`callback err, updates.sort versionOrder`
moved pack code into MongoPackManager 2015-02-13 11:18:15 -05:00
			`findProjectResults: (collection, query, limit, callback) ->`
			`# query - the mongo query selector, includes both the doc_id/project_id and`
			`# the range on v or meta.end_ts`
			`# limit - the mongo limit, we need to apply it after unpacking any`
			`# packs`

			`sort = {}`
			`sort['meta.end_ts'] = -1;`

			`projection = {"op":false, "pack.op": false}`
			`cursor = collection`
			`.find( query, projection ) # no need to return the op only need version info`
			`.sort( sort )`
			`# if we have packs, we will trim the results more later after expanding them`
			`if limit?`
			`cursor.limit(limit)`

			`# take the part of the query which selects the range over the parameter`
			`before = query['meta.end_ts']?['$lt'] # may be null`

			`updates = [] # used to accumulate the set of results`

			`cursor.toArray (err, result) ->`
			`if err?`
			`return callback err, result`
			`if result.length == 0 && not before? # no results and no time range specified`
			`return callback err, result`

			`unpackedSet = MongoPackManager._unpackResults(result)`
			`if limit?`
			`unpackedSet = unpackedSet.slice(0, limit)`
			`# find the end time of the last result, we will take all the`
			`# results up to this, and then all the changes at that time`
			`# (without imposing a limit) and any overlapping packs`
			`cutoff = if unpackedSet.length then unpackedSet[unpackedSet.length-1].meta.end_ts else null`
			`#console.log 'before is', before`
			`#console.log 'cutoff is', cutoff`
			`#console.log 'limit is', limit`

			`filterFn = (item) ->`
			`ts = item?.meta?.end_ts`
			`#console.log 'checking', ts, before, cutoff`
			`return false if before? && ts >= before`
			`return false if cutoff? && ts < cutoff`
			`return true`

fix project results order, sort by time 2015-02-16 08:43:11 -05:00			`timeOrder = (a, b) ->`
sort by doc_id to ensure consistent order for all changes with the same timestamp 2015-02-16 11:44:06 -05:00			`(b.meta.end_ts - a.meta.end_ts) \|\| documentOrder(a, b)`

			`documentOrder = (a, b) ->`
			`x = a.doc_id.valueOf()`
			`y = b.doc_id.valueOf()`
			`if x > y then 1 else if x < y then -1 else 0`
fix project results order, sort by time 2015-02-16 08:43:11 -05:00
moved pack code into MongoPackManager 2015-02-13 11:18:15 -05:00			`updates = MongoPackManager._filterAndLimit(updates, unpackedSet, filterFn, limit)`
			`#console.log 'initial updates are', updates`

			`# get all elements on the lower bound (cutoff)`
			`tailQuery = _.clone(query)`
			`tailQuery['meta.end_ts'] = cutoff`
			`tail = collection`
			`.find(tailQuery, projection)`
			`.sort(sort)`

			`#console.log 'tailQuery is', tailQuery`

			`# now find any packs that overlap with the time window`
			`overlapQuery = _.clone(query)`
			`if before? && cutoff?`
			`overlapQuery['meta.end_ts'] = {"$gte": before}`
			`overlapQuery['pack.0.meta.end_ts'] = {"$lte": before }`
			`else if before? && not cutoff?`
			`overlapQuery['meta.end_ts'] = {"$gte": before}`
			`overlapQuery['pack.0.meta.end_ts'] = {"$lte": before }`
			`else if not before? && cutoff?`
			`overlapQuery['meta.end_ts'] = {"$gte": cutoff}`
			`overlapQuery['pack.0.meta.end_ts'] = {"$gte": 0 }`
			`else if not before? && not cutoff?`
			`overlapQuery['meta.end_ts'] = {"$gte": 0 }`
			`overlapQuery['pack.0.meta.end_ts'] = {"$gte": 0 }`
			`overlap = collection`
			`.find(overlapQuery, projection)`
			`.sort(sort)`

			`#console.log 'overlapQuery is', overlapQuery`

			`# we don't specify a limit here, as there could be any number of overlaps`
			`# NB. need to catch items in original query and followup query for duplicates`

			`applyAndUpdate = (result) ->`
			`extraSet = MongoPackManager._unpackResults(result)`
			`# note: final argument is null, no limit applied because we`
			`# need all the updates at the final time to avoid breaking`
			`# the changeset into parts`
			`updates = MongoPackManager._filterAndLimit(updates, extraSet, filterFn, null)`
			`#console.log 'extra updates after filterandlimit', updates`
			`tail.toArray (err, result2) ->`
			`if err?`
fix project results order, sort by time 2015-02-16 08:43:11 -05:00			`return callback err, updates.sort timeOrder`
moved pack code into MongoPackManager 2015-02-13 11:18:15 -05:00			`else`
			`applyAndUpdate result2`
			`overlap.toArray (err, result3) ->`
			`if err?`
fix project results order, sort by time 2015-02-16 08:43:11 -05:00			`return callback err, updates.sort timeOrder`
moved pack code into MongoPackManager 2015-02-13 11:18:15 -05:00			`else`
			`applyAndUpdate result3`
fix project results order, sort by time 2015-02-16 08:43:11 -05:00			`callback err, updates.sort timeOrder`
moved pack code into MongoPackManager 2015-02-13 11:18:15 -05:00
			`_unpackResults: (updates) ->`
			`# iterate over the updates, if there's a pack, expand it into ops and`
			`# insert it into the array at that point`
			`result = []`
			`updates.forEach (item) ->`
			`if item.pack?`
			`all = MongoPackManager._explodePackToOps item`
			`result = result.concat all`
			`else`
			`result.push item`
			`return result`

			`_explodePackToOps: (packObj) ->`
			`# convert a pack into an array of ops`
			`doc_id = packObj.doc_id`
			`project_id = packObj.project_id`
			`result = packObj.pack.map (item) ->`
			`item.doc_id = doc_id`
			`item.project_id = project_id`
			`item`
			`return result.reverse()`

			`_filterAndLimit: (results, extra, filterFn, limit) ->`
			`# update results with extra docs, after filtering and limiting`
			`filtered = extra.filter(filterFn)`
			`newResults = results.concat filtered`
move update deduplication to filterAndLimit method 2015-02-16 10:37:47 -05:00			`# remove duplicates`
			`seen = {}`
			`newResults = newResults.filter (item) ->`
			`key = item.doc_id + ' ' + item.v`
			`#console.log 'key is', key`
			`if seen[key]`
			`return false`
			`else`
			`seen[key] = true`
			`return true`
moved pack code into MongoPackManager 2015-02-13 11:18:15 -05:00			`newResults.slice(0, limit) if limit?`
			`return newResults`