From 8db30020ae5802ca7fdad6e1196756addafc34da Mon Sep 17 00:00:00 2001 From: James Allen Date: Thu, 3 Dec 2015 16:32:20 +0000 Subject: [PATCH] Monitor event loop by looking for skew If we monitor with setImmediate, we miss big blocking loops. For example, suppose we have 1000 1ms loops then a single bad 1000ms loop. setImmediate will only be called at the right time 1/1000 of the time (it has to be the loop just before the bad one). So this monitoring method gives a good average if the std dev is low, but doesn't pick up spikes. Instead, we can monitor the skew from the expected time between setIntervals. In the case above, with a setInterval for 1000ms, we will pick up a skew proportional to the amount of time that it overlaps the bad loop. So 50% change of picking up skew > 500ms, and thus getting a good sense of any spikes. --- libraries/metrics/event_loop.coffee | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/libraries/metrics/event_loop.coffee b/libraries/metrics/event_loop.coffee index ab6955afc7..4e410892bf 100644 --- a/libraries/metrics/event_loop.coffee +++ b/libraries/metrics/event_loop.coffee @@ -1,18 +1,16 @@ -seconds = 1000 - module.exports = EventLoopMonitor = - monitor: (logger) -> - interval = setInterval () -> - EventLoopMonitor.Delay() - , 1 * seconds + monitor: (logger, interval = 1000, log_threshold = 100) -> Metrics = require "./metrics" + + previous = Date.now() + intervalId = setInterval () -> + now = Date.now() + offset = now - previous - interval + if offset > log_threshold + logger.warn {offset: offset}, "slow event loop" + previous = now + Metrics.timing("event-loop-millsec", offset) + , interval + Metrics.registerDestructor () -> - clearInterval(interval) - - Delay: () -> - Metrics = require "./metrics" - t1 = process.hrtime() - setImmediate () -> - delta = process.hrtime(t1) - responseTime = delta[0]*1e6 + delta[1]*1e-3 - Metrics.timing("event-loop-microsec", responseTime) + clearInterval(intervalId)