Add in external health check rather than internal

This commit is contained in:
James Allen 2016-06-23 18:00:03 +01:00
parent 414ab5d6a9
commit da89ff7172
6 changed files with 113 additions and 75 deletions

View file

@@ -60,6 +60,13 @@ app.get "/health_check/redis", (req, res, next)->
else
res.send 500
app.get "/health_check/redis_cluster", (req, res, next) ->
RedisManager.rclient.healthCheck (error, alive) ->
if error?
logger.err {err: error}, "failed redis cluster health check"
res.send 500
else
res.send 200
app.use (error, req, res, next) ->
if error instanceof Errors.NotFoundError

View file

@@ -5,7 +5,6 @@ logger = require "logger-sharelatex"
class Client
constructor: (@clients) ->
@HEARTBEAT_INTERVAL = 5000
@HEARTBEAT_TIMEOUT = 2000
multi: () ->
@@ -18,28 +17,41 @@ class Client
}
)
monitorTcpAndReconnect: () ->
for client in @clients
healthCheck: (callback) ->
jobs = @clients.map (client) =>
(cb) => @_healthCheckClient(client, cb)
async.parallel jobs, callback
_healthCheckClient: (client, callback) ->
if client.driver == "ioredis"
@_monitorCluster(client.rclient)
@_healthCheckClusterClient(client, callback)
else
@_healthCheckNodeRedisClient(client, callback)
_monitorCluster: (rclient) ->
setInterval () =>
# Nodes can come and go as the cluster moves/heals, so each heartbeat
# we ask again for the currently known nodes.
for node in rclient.nodes("all")
@_checkNode(node)
, @HEARTBEAT_INTERVAL
_healthCheckNodeRedisClient: (client, callback) ->
client.healthCheck ?= require("redis-sharelatex").activeHealthCheckRedis(Settings.redis.web)
if client.healthCheck.isAlive()
return callback()
else
return callback(new Error("node-redis client failed health check"))
_checkNode: (node) ->
_healthCheckClusterClient: (client, callback) ->
jobs = client.rclient.nodes("all").map (n) =>
(cb) => @_checkNode(n, cb)
async.parallel jobs, callback
_checkNode: (node, _callback) ->
callback = (args...) ->
_callback(args...)
_callback = () ->
timer = setTimeout () ->
logger.error {err: new Error("Node timed out, reconnecting"), key: node.options.key}
# Discussion of application layer monitoring recommends this way of reconnecting at https://github.com/luin/ioredis/issues/275
node.stream.destroy()
error = new Error("ioredis node ping check timed out")
logger.error {err: error, key: node.options.key}, "node timed out"
callback(error)
, @HEARTBEAT_TIMEOUT
node.ping (err) ->
if !err?
clearTimeout timer
callback(err)
class MultiClient
constructor: (@clients) ->

View file

@@ -10,9 +10,9 @@ Errors = require "./Errors"
# Make times easy to read
minutes = 60 # seconds for Redis expire
rclient.monitorTcpAndReconnect()
module.exports = RedisManager =
rclient: rclient
putDocInMemory : (project_id, doc_id, docLines, version, _callback)->
timer = new metrics.Timer("redis.put-doc")
callback = (error) ->

View file

@@ -20,31 +20,30 @@ module.exports =
port:"6379"
host:"localhost"
password:""
documentupdater:
# port:"6379"
# host:"localhost"
# password:""
# key_schema:
# blockingKey: ({doc_id}) -> "Blocking:#{doc_id}"
# docLines: ({doc_id}) -> "doclines:#{doc_id}"
# docOps: ({doc_id}) -> "DocOps:#{doc_id}"
# docVersion: ({doc_id}) -> "DocVersion:#{doc_id}"
# projectKey: ({doc_id}) -> "ProjectId:#{doc_id}"
# docsInProject: ({project_id}) -> "DocsIn:#{project_id}"
# To use Redis cluster, configure the backend as follows:
[{
documentupdater: [{
primary: true
cluster: [{
port: "7000"
port:"6379"
host:"localhost"
}]
password:""
key_schema:
blockingKey: ({doc_id}) -> "Blocking:{#{doc_id}}"
docLines: ({doc_id}) -> "doclines:{#{doc_id}}"
docOps: ({doc_id}) -> "DocOps:{#{doc_id}}"
docVersion: ({doc_id}) -> "DocVersion:{#{doc_id}}"
projectKey: ({doc_id}) -> "ProjectId:{#{doc_id}}"
docsInProject: ({project_id}) -> "DocsIn:{#{project_id}}"
blockingKey: ({doc_id}) -> "Blocking:#{doc_id}"
docLines: ({doc_id}) -> "doclines:#{doc_id}"
docOps: ({doc_id}) -> "DocOps:#{doc_id}"
docVersion: ({doc_id}) -> "DocVersion:#{doc_id}"
projectKey: ({doc_id}) -> "ProjectId:#{doc_id}"
docsInProject: ({project_id}) -> "DocsIn:#{project_id}"
# }, {
# cluster: [{
# port: "7000"
# host: "localhost"
# }]
# key_schema:
# blockingKey: ({doc_id}) -> "Blocking:{#{doc_id}}"
# docLines: ({doc_id}) -> "doclines:{#{doc_id}}"
# docOps: ({doc_id}) -> "DocOps:{#{doc_id}}"
# docVersion: ({doc_id}) -> "DocVersion:{#{doc_id}}"
# projectKey: ({doc_id}) -> "ProjectId:{#{doc_id}}"
# docsInProject: ({project_id}) -> "DocsIn:{#{project_id}}"
}]
max_doc_length: 2 * 1024 * 1024 # 2mb

View file

@@ -50,6 +50,7 @@ describe "RedisBackend", ->
"logger-sharelatex": @logger = { error: sinon.stub(), log: sinon.stub(), warn: sinon.stub() }
"redis-sharelatex": @redis =
createClient: sinon.stub().returns @rclient_redis = {}
activeHealthCheck: sinon.stub()
"ioredis": @ioredis =
Cluster: Cluster
@client = @RedisBackend.createClient()
@@ -317,10 +318,40 @@ describe "RedisBackend", ->
.calledWith(@rclient_ioredis)
.should.equal true
describe "_monitorCluster", ->
describe "_healthCheckNodeRedisClient", ->
beforeEach ->
@redis.activeHealthCheckRedis = sinon.stub().returns @healthCheck = {
isAlive: sinon.stub()
}
describe "successfully", ->
beforeEach (done) ->
@healthCheck.isAlive.returns true
@redis_client = {}
@client._healthCheckNodeRedisClient(@redis_client, done)
it "should check the status of the node redis client", ->
@healthCheck.isAlive.called.should.equal true
it "should only create one health check when called multiple times", (done) ->
@client._healthCheckNodeRedisClient @redis_client, () =>
@redis.activeHealthCheckRedis.calledOnce.should.equal true
@healthCheck.isAlive.calledTwice.should.equal true
done()
describe "when failing", ->
beforeEach ->
@healthCheck.isAlive.returns false
@redis_client = {}
it "should return an error", (done) ->
@client._healthCheckNodeRedisClient @redis_client, (error) ->
error.message.should.equal "node-redis client failed health check"
done()
describe "_healthCheckClusterClient", ->
beforeEach ->
@client.HEARTBEAT_TIMEOUT = 10
@client.HEARTBEAT_INTERVAL = 100
@nodes = [{
options: key: "node-0"
stream: destroy: sinon.stub()
@@ -330,37 +361,27 @@ describe "RedisBackend", ->
}]
@rclient_ioredis.nodes = sinon.stub().returns(@nodes)
describe "successfully", ->
beforeEach ->
@nodes[0].ping = (cb) -> cb()
@nodes[1].ping = (cb) -> cb()
@client._monitorCluster(@rclient_ioredis)
describe "when both clients are successful", ->
beforeEach (done) ->
@nodes[0].ping = sinon.stub().yields()
@nodes[1].ping = sinon.stub().yields()
@client._healthCheckClusterClient({ rclient: @rclient_ioredis }, done)
it "should get all nodes", ->
setTimeout () =>
it "should get all cluster nodes", ->
@rclient_ioredis.nodes
.calledWith("all")
.should.equal true
, 200
it "should not reset the node connections", (done) ->
setTimeout () =>
@nodes[0].stream.destroy.called.should.equal false
@nodes[1].stream.destroy.called.should.equal false
done()
, 200
it "should ping each cluster node", ->
for node in @nodes
node.ping.called.should.equal true
describe "when ping fails to a node", ->
beforeEach ->
@nodes[0].ping = (cb) -> cb()
@nodes[1].ping = (cb) -> # Just hang
@client._monitorCluster(@rclient_ioredis)
it "should reset the failing node connection", (done) ->
setTimeout () =>
@nodes[0].stream.destroy.called.should.equal false
@nodes[1].stream.destroy.called.should.equal true
it "should return an error", ->
@client._healthCheckClusterClient { rclient: @rclient_ioredis }, (error) ->
error.message.should.equal "ioredis node ping check timed out"
done()
, 200

View file

@@ -10,7 +10,6 @@ describe "RedisManager", ->
@rclient =
auth: () ->
exec: sinon.stub()
monitorTcpAndReconnect: () ->
@rclient.multi = () => @rclient
@RedisManager = SandboxedModule.require modulePath, requires:
"./RedisBackend":