From 387ecac6cb9e4362e8527f7c6add3689cd391975 Mon Sep 17 00:00:00 2001 From: Brian Gough Date: Wed, 17 May 2017 16:11:48 +0100 Subject: [PATCH 1/6] log for each get/try/release lock --- .../document-updater/app/coffee/LockManager.coffee | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/services/document-updater/app/coffee/LockManager.coffee b/services/document-updater/app/coffee/LockManager.coffee index 37cbe49914..1f710b448d 100644 --- a/services/document-updater/app/coffee/LockManager.coffee +++ b/services/document-updater/app/coffee/LockManager.coffee @@ -30,14 +30,16 @@ module.exports = LockManager = tryLock : (doc_id, callback = (err, isFree)->)-> lockValue = LockManager.randomLock() key = keys.blockingKey(doc_id:doc_id) + logger.log {doc_id: doc_id, key: key}, "tryLock" rclient.set key, lockValue, "EX", @LOCK_TTL, "NX", (err, gotLock)-> return callback(err) if err? if gotLock == "OK" metrics.inc "doc-not-blocking" + logger.log {doc_id: doc_id, key: key, lockValue: lockValue}, "got lock" callback err, true, lockValue else metrics.inc "doc-blocking" - logger.log {doc_id}, "doc is locked" + logger.log {doc_id: doc_id, key: key}, "doc is locked" callback err, false getLock: (doc_id, callback = (error, lockValue) ->) -> @@ -54,6 +56,7 @@ module.exports = LockManager = if gotLock callback(null, lockValue) else + logger.log {doc_id: doc_id, delay: testInterval}, "will retry lock" setTimeout attempt, testInterval # back off when the lock is taken to avoid overloading testInterval = Math.min(testInterval * 2, LockManager.MAX_TEST_INTERVAL) @@ -75,7 +78,9 @@ module.exports = LockManager = rclient.eval LockManager.unlockScript, 1, key, lockValue, (err, result) -> if err? return callback(err) - if result? and result isnt 1 # successful unlock should release exactly one key - logger.error {doc_id:doc_id, lockValue:lockValue, redis_err:err, redis_result:result}, "unlocking error" + else if result? and result isnt 1 # successful unlock should release exactly one key + logger.error {doc_id:doc_id, key:key, lockValue:lockValue, redis_err:err, redis_result:result}, "unlocking error" return callback(new Error("tried to release timed out lock")) - callback(err,result) + else + logger.log {doc_id:doc_id, key:key, lockValue:lockValue}, "released lock" + callback(err,result) From b6efb051e145367caab6e10b8282c3e9b16af74d Mon Sep 17 00:00:00 2001 From: Brian Gough Date: Thu, 18 May 2017 11:00:07 +0100 Subject: [PATCH 2/6] add profiling --- .../app/coffee/Profiler.coffee | 28 +++++++++++++++++++ .../app/coffee/UpdateManager.coffee | 23 +++++++++++++-- 2 files changed, 48 insertions(+), 3 deletions(-) create mode 100644 services/document-updater/app/coffee/Profiler.coffee diff --git a/services/document-updater/app/coffee/Profiler.coffee b/services/document-updater/app/coffee/Profiler.coffee new file mode 100644 index 0000000000..8aacf2a0a8 --- /dev/null +++ b/services/document-updater/app/coffee/Profiler.coffee @@ -0,0 +1,28 @@ +Settings = require('settings-sharelatex') +logger = require('logger-sharelatex') + +deltaMs = (ta, tb) -> + return Math.floor(((ta[0]-tb[0])*1e9 + (ta[1]-tb[1]))*1e-6) + +module.exports = class Profiler + LOG_CUTOFF_TIME: 100 + + constructor: (@name, @args) -> + @t0 = @t = process.hrtime() + @updateTimes = [] + + log: (label) -> + t1 = process.hrtime() + dtMilliSec = deltaMs(t1, @t) + @t = t1 + @updateTimes.push [label, dtMilliSec] # timings in ms + return @ # make it chainable + + end: (message) -> + totalTime = deltaMs(@t, @t0) + return if totalTime < @LOG_CUTOFF_TIME # skip anything less than cutoff + args = {} + for k,v of @args + args[k] = v + args.updateTimes = @updateTimes + logger.log args, @name diff --git a/services/document-updater/app/coffee/UpdateManager.coffee b/services/document-updater/app/coffee/UpdateManager.coffee index 269b16ee67..70caaf1e03 100644 --- a/services/document-updater/app/coffee/UpdateManager.coffee +++ b/services/document-updater/app/coffee/UpdateManager.coffee @@ -10,6 +10,7 @@ Metrics = require "./Metrics" Errors = require "./Errors" DocumentManager = require "./DocumentManager" RangesManager = require "./RangesManager" +Profiler = require "./Profiler" module.exports = UpdateManager = processOutstandingUpdates: (project_id, doc_id, callback = (error) ->) -> @@ -20,13 +21,17 @@ module.exports = UpdateManager = callback() processOutstandingUpdatesWithLock: (project_id, doc_id, callback = (error) ->) -> + profile = new Profiler("processOutstandingUpdatesWithLock", {project_id, doc_id}) LockManager.tryLock doc_id, (error, gotLock, lockValue) => return callback(error) if error? return callback() if !gotLock + profile.log("tryLock") UpdateManager.processOutstandingUpdates project_id, doc_id, (error) -> return UpdateManager._handleErrorInsideLock(doc_id, lockValue, error, callback) if error? + profile.log("processOutstandingUpdates") LockManager.releaseLock doc_id, lockValue, (error) => return callback(error) if error? + profile.log("releaseLock").end() UpdateManager.continueProcessingUpdatesWithLock project_id, doc_id, callback continueProcessingUpdatesWithLock: (project_id, doc_id, callback = (error) ->) -> @@ -38,13 +43,20 @@ module.exports = UpdateManager = callback() fetchAndApplyUpdates: (project_id, doc_id, callback = (error) ->) -> + profile = new Profiler("fetchAndApplyUpdates", {project_id, doc_id}) RealTimeRedisManager.getPendingUpdatesForDoc doc_id, (error, updates) => return callback(error) if error? if updates.length == 0 return callback() - async.eachSeries updates, - (update, cb) -> UpdateManager.applyUpdate project_id, doc_id, update, cb - callback + profile.log("getPendingUpdatesForDoc") + doUpdate = (update, cb)-> + UpdateManager.applyUpdate project_id, doc_id, update, (err) -> + profile.log("applyUpdate") + cb(err) + finalCallback = (err) -> + profile.log("async done").end() + callback(err) + async.eachSeries updates, doUpdate, finalCallback applyUpdate: (project_id, doc_id, update, _callback = (error) ->) -> callback = (error) -> @@ -66,14 +78,19 @@ module.exports = UpdateManager = HistoryManager.recordAndFlushHistoryOps project_id, doc_id, appliedOps, historyOpsLength, callback lockUpdatesAndDo: (method, project_id, doc_id, args..., callback) -> + profile = new Profiler("lockUpdatesAndDo", {project_id, doc_id}) LockManager.getLock doc_id, (error, lockValue) -> + profile.log("getLock") return callback(error) if error? UpdateManager.processOutstandingUpdates project_id, doc_id, (error) -> return UpdateManager._handleErrorInsideLock(doc_id, lockValue, error, callback) if error? + profile.log("processOutStandingUpdates") method project_id, doc_id, args..., (error, response_args...) -> return UpdateManager._handleErrorInsideLock(doc_id, lockValue, error, callback) if error? + profile.log("method") LockManager.releaseLock doc_id, lockValue, (error) -> return callback(error) if error? + profile.log("releaseLock").end() callback null, response_args... # We held the lock for a while so updates might have queued up UpdateManager.continueProcessingUpdatesWithLock project_id, doc_id From e11f64f83a1f918afa7998b817e551e0d7311a09 Mon Sep 17 00:00:00 2001 From: Brian Gough Date: Thu, 18 May 2017 15:02:08 +0100 Subject: [PATCH 3/6] increase profiling cutoff to 1s --- services/document-updater/app/coffee/Profiler.coffee | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/document-updater/app/coffee/Profiler.coffee b/services/document-updater/app/coffee/Profiler.coffee index 8aacf2a0a8..f4ee219afb 100644 --- a/services/document-updater/app/coffee/Profiler.coffee +++ b/services/document-updater/app/coffee/Profiler.coffee @@ -5,7 +5,7 @@ deltaMs = (ta, tb) -> return Math.floor(((ta[0]-tb[0])*1e9 + (ta[1]-tb[1]))*1e-6) module.exports = class Profiler - LOG_CUTOFF_TIME: 100 + LOG_CUTOFF_TIME: 1000 constructor: (@name, @args) -> @t0 = @t = process.hrtime() From e00f4dde8e16bd433d241c519b04718a3fe722fa Mon Sep 17 00:00:00 2001 From: Brian Gough Date: Thu, 18 May 2017 15:02:44 +0100 Subject: [PATCH 4/6] log profilers start and end times for reference --- services/document-updater/app/coffee/Profiler.coffee | 3 +++ 1 file changed, 3 insertions(+) diff --git a/services/document-updater/app/coffee/Profiler.coffee b/services/document-updater/app/coffee/Profiler.coffee index f4ee219afb..70a0813b67 100644 --- a/services/document-updater/app/coffee/Profiler.coffee +++ b/services/document-updater/app/coffee/Profiler.coffee @@ -9,6 +9,7 @@ module.exports = class Profiler constructor: (@name, @args) -> @t0 = @t = process.hrtime() + @start = new Date() @updateTimes = [] log: (label) -> @@ -25,4 +26,6 @@ module.exports = class Profiler for k,v of @args args[k] = v args.updateTimes = @updateTimes + args.start = @start + args.end = new Date() logger.log args, @name From 95596061d44be949e52b53aea1a52a406c57611d Mon Sep 17 00:00:00 2001 From: Brian Gough Date: Thu, 18 May 2017 15:02:54 +0100 Subject: [PATCH 5/6] clarify calculation of process.hrtime --- services/document-updater/app/coffee/Profiler.coffee | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/services/document-updater/app/coffee/Profiler.coffee b/services/document-updater/app/coffee/Profiler.coffee index 70a0813b67..dc88345334 100644 --- a/services/document-updater/app/coffee/Profiler.coffee +++ b/services/document-updater/app/coffee/Profiler.coffee @@ -2,7 +2,9 @@ Settings = require('settings-sharelatex') logger = require('logger-sharelatex') deltaMs = (ta, tb) -> - return Math.floor(((ta[0]-tb[0])*1e9 + (ta[1]-tb[1]))*1e-6) + nanoSeconds = (ta[0]-tb[0])*1e9 + (ta[1]-tb[1]) + milliSeconds = Math.floor(nanoSeconds*1e-6) + return milliSeconds module.exports = class Profiler LOG_CUTOFF_TIME: 1000 From a5500780711507ebea0d55800c06cb55813ae5ff Mon Sep 17 00:00:00 2001 From: Brian Gough Date: Thu, 18 May 2017 15:04:12 +0100 Subject: [PATCH 6/6] replace verbose logging by slow query log --- .../app/coffee/LockManager.coffee | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/services/document-updater/app/coffee/LockManager.coffee b/services/document-updater/app/coffee/LockManager.coffee index 1f710b448d..d918dc4332 100644 --- a/services/document-updater/app/coffee/LockManager.coffee +++ b/services/document-updater/app/coffee/LockManager.coffee @@ -7,6 +7,8 @@ logger = require "logger-sharelatex" os = require "os" crypto = require "crypto" +Profiler = require "./Profiler" + HOST = os.hostname() PID = process.pid RND = crypto.randomBytes(4).toString('hex') @@ -30,33 +32,36 @@ module.exports = LockManager = tryLock : (doc_id, callback = (err, isFree)->)-> lockValue = LockManager.randomLock() key = keys.blockingKey(doc_id:doc_id) - logger.log {doc_id: doc_id, key: key}, "tryLock" + profile = new Profiler("tryLock", {doc_id, key, lockValue}) rclient.set key, lockValue, "EX", @LOCK_TTL, "NX", (err, gotLock)-> return callback(err) if err? if gotLock == "OK" metrics.inc "doc-not-blocking" - logger.log {doc_id: doc_id, key: key, lockValue: lockValue}, "got lock" + profile.log("got lock").end() callback err, true, lockValue else metrics.inc "doc-blocking" - logger.log {doc_id: doc_id, key: key}, "doc is locked" + profile.log("doc is locked").end() callback err, false getLock: (doc_id, callback = (error, lockValue) ->) -> startTime = Date.now() testInterval = LockManager.LOCK_TEST_INTERVAL + profile = new Profiler("getLock", {doc_id}) do attempt = () -> if Date.now() - startTime > LockManager.MAX_LOCK_WAIT_TIME e = new Error("Timeout") e.doc_id = doc_id + profile.log("timeout").end() return callback(e) LockManager.tryLock doc_id, (error, gotLock, lockValue) -> return callback(error) if error? + profile.log("tryLock") if gotLock + profile.end() callback(null, lockValue) else - logger.log {doc_id: doc_id, delay: testInterval}, "will retry lock" setTimeout attempt, testInterval # back off when the lock is taken to avoid overloading testInterval = Math.min(testInterval * 2, LockManager.MAX_TEST_INTERVAL) @@ -75,12 +80,14 @@ module.exports = LockManager = releaseLock: (doc_id, lockValue, callback)-> key = keys.blockingKey(doc_id:doc_id) + profile = new Profiler("releaseLock", {doc_id, key, lockValue}) rclient.eval LockManager.unlockScript, 1, key, lockValue, (err, result) -> if err? return callback(err) else if result? and result isnt 1 # successful unlock should release exactly one key + profile.log("unlockScript:expired-lock").end() logger.error {doc_id:doc_id, key:key, lockValue:lockValue, redis_err:err, redis_result:result}, "unlocking error" return callback(new Error("tried to release timed out lock")) else - logger.log {doc_id:doc_id, key:key, lockValue:lockValue}, "released lock" + profile.log("unlockScript:ok").end() callback(err,result)