Add in external health check rather than internal

This commit is contained in:
James Allen 2016-06-23 18:00:03 +01:00
parent 414ab5d6a9
commit da89ff7172
6 changed files with 113 additions and 75 deletions

View file

@ -60,6 +60,13 @@ app.get "/health_check/redis", (req, res, next)->
else else
res.send 500 res.send 500
app.get "/health_check/redis_cluster", (req, res, next) ->
RedisManager.rclient.healthCheck (error, alive) ->
if error?
logger.err {err: error}, "failed redis cluster health check"
res.send 500
else
res.send 200
app.use (error, req, res, next) -> app.use (error, req, res, next) ->
if error instanceof Errors.NotFoundError if error instanceof Errors.NotFoundError

View file

@ -5,7 +5,6 @@ logger = require "logger-sharelatex"
class Client class Client
constructor: (@clients) -> constructor: (@clients) ->
@HEARTBEAT_INTERVAL = 5000
@HEARTBEAT_TIMEOUT = 2000 @HEARTBEAT_TIMEOUT = 2000
multi: () -> multi: () ->
@ -18,28 +17,41 @@ class Client
} }
) )
monitorTcpAndReconnect: () -> healthCheck: (callback) ->
for client in @clients jobs = @clients.map (client) =>
(cb) => @_healthCheckClient(client, cb)
async.parallel jobs, callback
_healthCheckClient: (client, callback) ->
if client.driver == "ioredis" if client.driver == "ioredis"
@_monitorCluster(client.rclient) @_healthCheckClusterClient(client, callback)
else
@_healthCheckNodeRedisClient(client, callback)
_monitorCluster: (rclient) -> _healthCheckNodeRedisClient: (client, callback) ->
setInterval () => client.healthCheck ?= require("redis-sharelatex").activeHealthCheckRedis(Settings.redis.web)
# Nodes can come and go as the cluster moves/heals, so each heartbeat if client.healthCheck.isAlive()
# we ask again for the currently known nodes. return callback()
for node in rclient.nodes("all") else
@_checkNode(node) return callback(new Error("node-redis client failed health check"))
, @HEARTBEAT_INTERVAL
_checkNode: (node) -> _healthCheckClusterClient: (client, callback) ->
jobs = client.rclient.nodes("all").map (n) =>
(cb) => @_checkNode(n, cb)
async.parallel jobs, callback
_checkNode: (node, _callback) ->
callback = (args...) ->
_callback(args...)
_callback = () ->
timer = setTimeout () -> timer = setTimeout () ->
logger.error {err: new Error("Node timed out, reconnecting"), key: node.options.key} error = new Error("ioredis node ping check timed out")
# Discussion of application layer monitoring recommends this way of reconnecting at https://github.com/luin/ioredis/issues/275 logger.error {err: error, key: node.options.key}, "node timed out"
node.stream.destroy() callback(error)
, @HEARTBEAT_TIMEOUT , @HEARTBEAT_TIMEOUT
node.ping (err) -> node.ping (err) ->
if !err?
clearTimeout timer clearTimeout timer
callback(err)
class MultiClient class MultiClient
constructor: (@clients) -> constructor: (@clients) ->

View file

@ -10,9 +10,9 @@ Errors = require "./Errors"
# Make times easy to read # Make times easy to read
minutes = 60 # seconds for Redis expire minutes = 60 # seconds for Redis expire
rclient.monitorTcpAndReconnect()
module.exports = RedisManager = module.exports = RedisManager =
rclient: rclient
putDocInMemory : (project_id, doc_id, docLines, version, _callback)-> putDocInMemory : (project_id, doc_id, docLines, version, _callback)->
timer = new metrics.Timer("redis.put-doc") timer = new metrics.Timer("redis.put-doc")
callback = (error) -> callback = (error) ->

View file

@ -20,31 +20,30 @@ module.exports =
port:"6379" port:"6379"
host:"localhost" host:"localhost"
password:"" password:""
documentupdater: documentupdater: [{
# port:"6379"
# host:"localhost"
# password:""
# key_schema:
# blockingKey: ({doc_id}) -> "Blocking:#{doc_id}"
# docLines: ({doc_id}) -> "doclines:#{doc_id}"
# docOps: ({doc_id}) -> "DocOps:#{doc_id}"
# docVersion: ({doc_id}) -> "DocVersion:#{doc_id}"
# projectKey: ({doc_id}) -> "ProjectId:#{doc_id}"
# docsInProject: ({project_id}) -> "DocsIn:#{project_id}"
# To use Redis cluster, configure the backend as follows:
[{
primary: true primary: true
cluster: [{ port:"6379"
port: "7000" host:"localhost"
host: "localhost" password:""
}]
key_schema: key_schema:
blockingKey: ({doc_id}) -> "Blocking:{#{doc_id}}" blockingKey: ({doc_id}) -> "Blocking:#{doc_id}"
docLines: ({doc_id}) -> "doclines:{#{doc_id}}" docLines: ({doc_id}) -> "doclines:#{doc_id}"
docOps: ({doc_id}) -> "DocOps:{#{doc_id}}" docOps: ({doc_id}) -> "DocOps:#{doc_id}"
docVersion: ({doc_id}) -> "DocVersion:{#{doc_id}}" docVersion: ({doc_id}) -> "DocVersion:#{doc_id}"
projectKey: ({doc_id}) -> "ProjectId:{#{doc_id}}" projectKey: ({doc_id}) -> "ProjectId:#{doc_id}"
docsInProject: ({project_id}) -> "DocsIn:{#{project_id}}" docsInProject: ({project_id}) -> "DocsIn:#{project_id}"
# }, {
# cluster: [{
# port: "7000"
# host: "localhost"
# }]
# key_schema:
# blockingKey: ({doc_id}) -> "Blocking:{#{doc_id}}"
# docLines: ({doc_id}) -> "doclines:{#{doc_id}}"
# docOps: ({doc_id}) -> "DocOps:{#{doc_id}}"
# docVersion: ({doc_id}) -> "DocVersion:{#{doc_id}}"
# projectKey: ({doc_id}) -> "ProjectId:{#{doc_id}}"
# docsInProject: ({project_id}) -> "DocsIn:{#{project_id}}"
}] }]
max_doc_length: 2 * 1024 * 1024 # 2mb max_doc_length: 2 * 1024 * 1024 # 2mb

View file

@ -50,6 +50,7 @@ describe "RedisBackend", ->
"logger-sharelatex": @logger = { error: sinon.stub(), log: sinon.stub(), warn: sinon.stub() } "logger-sharelatex": @logger = { error: sinon.stub(), log: sinon.stub(), warn: sinon.stub() }
"redis-sharelatex": @redis = "redis-sharelatex": @redis =
createClient: sinon.stub().returns @rclient_redis = {} createClient: sinon.stub().returns @rclient_redis = {}
activeHealthCheck: sinon.stub()
"ioredis": @ioredis = "ioredis": @ioredis =
Cluster: Cluster Cluster: Cluster
@client = @RedisBackend.createClient() @client = @RedisBackend.createClient()
@ -317,10 +318,40 @@ describe "RedisBackend", ->
.calledWith(@rclient_ioredis) .calledWith(@rclient_ioredis)
.should.equal true .should.equal true
describe "_monitorCluster", -> describe "_healthCheckNodeRedisClient", ->
beforeEach ->
@redis.activeHealthCheckRedis = sinon.stub().returns @healthCheck = {
isAlive: sinon.stub()
}
describe "successfully", ->
beforeEach (done) ->
@healthCheck.isAlive.returns true
@redis_client = {}
@client._healthCheckNodeRedisClient(@redis_client, done)
it "should check the status of the node redis client", ->
@healthCheck.isAlive.called.should.equal true
it "should only create one health check when called multiple times", (done) ->
@client._healthCheckNodeRedisClient @redis_client, () =>
@redis.activeHealthCheckRedis.calledOnce.should.equal true
@healthCheck.isAlive.calledTwice.should.equal true
done()
describe "when failing", ->
beforeEach ->
@healthCheck.isAlive.returns false
@redis_client = {}
it "should return an error", (done) ->
@client._healthCheckNodeRedisClient @redis_client, (error) ->
error.message.should.equal "node-redis client failed health check"
done()
describe "_healthCheckClusterClient", ->
beforeEach -> beforeEach ->
@client.HEARTBEAT_TIMEOUT = 10 @client.HEARTBEAT_TIMEOUT = 10
@client.HEARTBEAT_INTERVAL = 100
@nodes = [{ @nodes = [{
options: key: "node-0" options: key: "node-0"
stream: destroy: sinon.stub() stream: destroy: sinon.stub()
@ -330,37 +361,27 @@ describe "RedisBackend", ->
}] }]
@rclient_ioredis.nodes = sinon.stub().returns(@nodes) @rclient_ioredis.nodes = sinon.stub().returns(@nodes)
describe "successfully", -> describe "when both clients are successful", ->
beforeEach -> beforeEach (done) ->
@nodes[0].ping = (cb) -> cb() @nodes[0].ping = sinon.stub().yields()
@nodes[1].ping = (cb) -> cb() @nodes[1].ping = sinon.stub().yields()
@client._monitorCluster(@rclient_ioredis) @client._healthCheckClusterClient({ rclient: @rclient_ioredis }, done)
it "should get all nodes", -> it "should get all cluster nodes", ->
setTimeout () =>
@rclient_ioredis.nodes @rclient_ioredis.nodes
.calledWith("all") .calledWith("all")
.should.equal true .should.equal true
, 200
it "should not reset the node connections", (done) -> it "should ping each cluster node", ->
setTimeout () => for node in @nodes
@nodes[0].stream.destroy.called.should.equal false node.ping.called.should.equal true
@nodes[1].stream.destroy.called.should.equal false
done()
, 200
describe "when ping fails to a node", -> describe "when ping fails to a node", ->
beforeEach -> beforeEach ->
@nodes[0].ping = (cb) -> cb() @nodes[0].ping = (cb) -> cb()
@nodes[1].ping = (cb) -> # Just hang @nodes[1].ping = (cb) -> # Just hang
@client._monitorCluster(@rclient_ioredis)
it "should reset the failing node connection", (done) -> it "should return an error", ->
setTimeout () => @client._healthCheckClusterClient { rclient: @rclient_ioredis }, (error) ->
@nodes[0].stream.destroy.called.should.equal false error.message.should.equal "ioredis node ping check timed out"
@nodes[1].stream.destroy.called.should.equal true
done() done()
, 200

View file

@ -10,7 +10,6 @@ describe "RedisManager", ->
@rclient = @rclient =
auth: () -> auth: () ->
exec: sinon.stub() exec: sinon.stub()
monitorTcpAndReconnect: () ->
@rclient.multi = () => @rclient @rclient.multi = () => @rclient
@RedisManager = SandboxedModule.require modulePath, requires: @RedisManager = SandboxedModule.require modulePath, requires:
"./RedisBackend": "./RedisBackend":