Merge pull request #105 from overleaf/bg-use-separate-redis-for-project-history

support migration of project history keys to separate redis instance
This commit is contained in:
Brian Gough 2020-04-17 11:26:49 +01:00 committed by GitHub
commit c2e165e0eb
8 changed files with 574 additions and 6 deletions

View file

@ -1,6 +1,7 @@
Settings = require('settings-sharelatex')
projectHistoryKeys = Settings.redis?.project_history?.key_schema
rclient = require("redis-sharelatex").createClient(Settings.redis.documentupdater)
#rclient = require("redis-sharelatex").createClient(Settings.redis.project_history)
rclient = require("./RedisMigrationManager").createClient(Settings.redis.project_history, Settings.redis.new_project_history)
logger = require('logger-sharelatex')
metrics = require('./Metrics')

View file

@ -0,0 +1,224 @@
logger = require "logger-sharelatex"
Settings = require "settings-sharelatex"
redis = require("redis-sharelatex")
LockManager = require("./LockManager")
metrics = require "./Metrics"
async = require("async")
# The aim is to migrate the project history queues
# ProjectHistory:Ops:{project_id} from the existing redis to a new redis.
#
# This has to work in conjunction with changes in project history.
#
# The basic principles are:
#
# - project history is modified to read from an 'old' and 'new' queue. It reads
# from the 'old' queue first, and when that queue is empty it reads from the
# 'new' queue.
# - docupdater will migrate to writing to the 'new' queue when the 'old' queue
# is empty.
#
# Some facts about the update process:
#
# - project history has a lock on the project-id, so each queue is processed in
# isolation
# - docupdaters take a lock on the doc_id but not the project_id, therefore
# multiple docupdaters can be appending to the queue for a project at the same
# time (provided the updates for individual docs are in order this is
# acceptable)
# - as we want to do this without shutting down the site, we have to take into
# account that different versions of the code will be running while deploys
# are in progress.
#
# The migration has to be carried out with the following constraint:
#
# - a docupdater should never write to the "old" queue when there are updates in
# the "new" queue (there is a strict ordering on the versions, new > old)
#
# The deployment process for docupdater will be
#
# - add a project-level lock to the queuing in docupdater
# - use a per-project migration flag to determine when to write to the new redis
# - set the migration flag for projects with an empty queue in the old redis
# - when all docupdaters respect the flag, make a new deploy which starts to set
# the flag
# - when all docupdaters are setting the flag (and writing to the new redis),
# finish the migration by writing all data to the new redis
#
# Final stage
#
# When all the queues are migrated, remove the migration code and return to a
# single client pointing at the new redis. Delete the
# ProjectHistory:MigrationKey:* entries in the new redis.
#
# Rollback
#
# Under the scheme above a project should only ever have data in the old redis
# or the new redis, but never both at the same time.
#
# Two scenarios:
#
# Hard rollback
#
# If we want to roll back to the old redis immediately, we need to get the data
# out of the new queues and back into the old queues, before appending to the
# old queues again. The actions to do this are:
#
# - close the site
# - revert docupdater so it only writes to the original redis (there will now
# be some data in the new redis for some projects which we need to recover)
# - run a script to move the new queues back into the old redis
# - revert project history to only read from the original redis
#
# Graceful rollback
#
# If we are prepared to keep the new redis running, but not add new projects to
# it we can do the following:
#
# - deploy all docupdaters to update from the "switch" phase into the
# "rollback" phase (projects in the new redis will continue to send data
# there, projects not yet migrated will continue to go to the old redis)
# - deploy project history with the "old queue" pointing to the new redis and
# the "new queue" to the old redis, so that the new redis is cleared before
# the old redis is processed (i.e. add a rollback:true property in new_project_history in
# the project-history settings via the environment variable
# MIGRATION_PHASE="rollback").
# - projects will now clear gradually from the new redis back to the old redis
# - get a list of all the projects in the new redis and flush them, which will
# cause the new queues to be cleared and the old redis to be used for those
# projects.
# Pull the 24-character lowercase-hex project id out of the {...} tag of a
# redis key (for example "ProjectHistory:Ops:{<project_id>}").
# Throws a TypeError if the key carries no such tag.
getProjectId = (key) ->
  matched = key.match(/\{([0-9a-f]{24})\}/)
  return matched[1]
# Minimal stand-in for a redis MULTI used while the project history queues are
# being migrated. Commands are only recorded here; on exec() the project lock
# is taken, the migration client picks the old or new redis for the queue, and
# the recorded commands are replayed on a real multi of that client.
# Only rpush and setnx are supported.
class Multi
  constructor: (@migrationClient) ->
    @command_list = []
    @queueKey = null
    # number of updates queued via rpush, reported as a metric on exec
    @updates_count = 0

  # Record an RPUSH. args[0] is the queue key, the remaining args are the
  # updates; the key is remembered because exec() derives the project from it.
  rpush: (args...) ->
    @queueKey = args[0]
    # accumulate so that several rpush calls are all counted in the metric
    @updates_count += args.length - 1
    @command_list.push { command:'rpush', args: args}

  # Record a SETNX to be run on the same client as the queue write.
  setnx: (args...) ->
    @command_list.push { command: 'setnx', args: args}

  # Run the recorded commands. callback receives the arguments of the
  # underlying multi.exec, or a single error if the lock, the queue lookup or
  # the lock release fails.
  exec: (callback) ->
    # The project id (and with it the lock and the target redis) comes from
    # the queue key. Report a missing or malformed key through the callback
    # rather than throwing from inside the caller's stack.
    try
      project_id = getProjectId(@queueKey)
    catch keyError
      return callback(new Error("cannot determine project id from queue key #{@queueKey}: #{keyError.message}"))
    # Put a lock around finding and updating the queue to avoid time-of-check to
    # time-of-use problems. When running in the "switch" phase we need a lock to
    # guarantee the order of operations. (Example: docupdater A sees an old
    # queue at t=t0 and pushes onto it at t=t1, project history clears the queue
    # between t0 and t1, and docupdater B sees the empty queue, sets the
    # migration flag and pushes onto the new queue at t2. Without a lock it's
    # possible to have t2 < t1 if docupdater A is slower than B - then there
    # would be entries in the old and new queues, which we want to avoid.)
    LockManager.getLock project_id, (error, lockValue) =>
      return callback(error) if error?
      # every exit path below goes through releaseLock; a failure to release
      # the lock takes precedence over the result being passed on
      releaseLock = (args...) =>
        LockManager.releaseLock project_id, lockValue, (lockError) ->
          return callback(lockError) if lockError?
          callback(args...)
      @migrationClient.findQueue @queueKey, (err, rclient) =>
        return releaseLock(err) if err?
        # add metric for updates
        dest = (if rclient == @migrationClient.rclient_new then "new" else "old")
        metrics.count "migration", @updates_count, 1, {status: "#{@migrationClient.migration_phase}-#{dest}"}
        multi = rclient.multi()
        for entry in @command_list
          multi[entry.command](entry.args...)
        multi.exec releaseLock
# Holds one client for the original project history redis and one for the new
# redis, and decides per project which of the two a queue write must go to.
# The decision depends on the configured migration phase, on the per-project
# migration key stored in the new redis, and on which queues currently exist.
class MigrationClient
  constructor: (@old_settings, @new_settings) ->
    @rclient_old = redis.createClient(@old_settings)
    @rclient_new = redis.createClient(@new_settings)
    # read from the instance property (as the two lines above do) instead of
    # relying on the bare `new_settings` name being bound for an @-parameter
    @new_key_schema = @new_settings.key_schema
    # check that migration phase is valid on startup
    logger.warn {migration_phase: @getMigrationPhase()}, "running with RedisMigrationManager"

  # Re-read the phase from the settings, remember it on the instance and
  # return it. Throws if it is not one of 'prepare', 'switch' or 'rollback'.
  getMigrationPhase: () ->
    @migration_phase = @new_settings.migration_phase # FIXME: allow setting migration phase while running for testing
    throw new Error("invalid migration phase") unless @migration_phase in ['prepare', 'switch', 'rollback']
    return @migration_phase

  # Check, in order, whether the migration key exists in the new redis, whether
  # the queue exists in the new redis and whether it exists in the old redis.
  # Calls back with (err, migrationKeyExists, newQueueExists, oldQueueExists).
  getMigrationStatus: (key, migrationKey, callback) ->
    async.series [
      (cb) => @rclient_new.exists migrationKey, cb
      (cb) => @rclient_new.exists key, cb
      (cb) => @rclient_old.exists key, cb
    ], (err, result) ->
      return callback(err) if err?
      migrationKeyExists = result[0] > 0
      newQueueExists = result[1] > 0
      oldQueueExists = result[2] > 0
      callback(null, migrationKeyExists, newQueueExists, oldQueueExists)

  # Select the redis client that writes to the queue `key` must use.
  # Calls back with (err, rclient). May set the migration key (switch phase)
  # or delete it (rollback phase) as a side effect.
  findQueue: (key, callback) ->
    # An invalid phase or a key without a project id would otherwise be thrown
    # from here; Multi.exec calls this while holding the project lock, so
    # report the failure through the callback to let the lock be released.
    try
      project_id = getProjectId(key)
      migrationKey = @new_key_schema.projectHistoryMigrationKey({project_id})
      migration_phase = @getMigrationPhase() # allow setting migration phase while running for testing
    catch error
      logger.error {key: key, err: error}, "unable to select redis client for queue"
      return callback(error)
    @getMigrationStatus key, migrationKey, (err, migrationKeyExists, newQueueExists, oldQueueExists) =>
      return callback(err) if err?
      # In all cases, if the migration key exists we must always write to the
      # new redis, unless we are rolling back.
      if migration_phase is "prepare"
        # in this phase we prepare for the switch, when some docupdaters will
        # start setting the migration flag. We monitor the migration key and
        # write to the new redis if the key is present, but we do not set the
        # migration key. At this point no writes will be going into the new
        # redis. When all the docupdaters are in the "prepare" phase we can
        # begin deploying the "switch" phase.
        if migrationKeyExists
          logger.debug {project_id}, "using new client because migration key exists"
          return callback(null, @rclient_new)
        else
          logger.debug {project_id}, "using old client because migration key does not exist"
          return callback(null, @rclient_old)
      else if migration_phase is "switch"
        # As we deploy the "switch" phase new docupdaters will set the migration
        # flag for projects which have an empty queue in the old redis, and
        # write updates into the new redis. Existing docupdaters still in the
        # "prepare" phase will pick up the migration flag and write new updates
        # into the new redis when appropriate. When this deploy is complete
        # writes will be going into the new redis for projects with an empty
        # queue in the old redis. We have to remain in the switch phase until
        # all projects are flushed from the old redis.
        if migrationKeyExists
          logger.debug {project_id}, "using new client because migration key exists"
          return callback(null, @rclient_new)
        else
          if oldQueueExists
            logger.debug {project_id}, "using old client because old queue exists"
            return callback(null, @rclient_old)
          else
            @rclient_new.setnx migrationKey, "NEW", (err) =>
              return callback(err) if err?
              logger.debug {key: key}, "switching to new redis because old queue is empty"
              return callback(null, @rclient_new)
      else if migration_phase is "rollback"
        # If we need to roll back gracefully we do the opposite of the "switch"
        # phase. We use the new redis when the migration key is set and the
        # queue exists in the new redis, but if the queue in the new redis is
        # empty we delete the migration key and send further updates to the old
        # redis.
        if migrationKeyExists
          if newQueueExists
            logger.debug {project_id}, "using new client because migration key exists and new queue is present"
            return callback(null, @rclient_new)
          else
            @rclient_new.del migrationKey, (err) =>
              return callback(err) if err?
              logger.debug {key: key}, "switching to old redis in rollback phase because new queue is empty"
              return callback(null, @rclient_old)
        else
          logger.debug {project_id}, "using old client because migration key does not exist"
          return callback(null, @rclient_old)
      else
        # defensive: getMigrationPhase already rejects unknown phases, this
        # only triggers if its list and the branches above drift apart
        logger.error {key: key, migration_phase: migration_phase}, "unknown migration phase"
        callback(new Error('invalid migration phase'))

  # Start a new command batch bound to this migration client.
  multi: () ->
    new Multi(@)
# Public entry point of the module. createClient forwards all of its arguments
# to the MigrationClient constructor and returns the new instance.
RedisMigrationManager =
  createClient: (args...) ->
    return new MigrationClient(args...)

module.exports = RedisMigrationManager

View file

@ -45,6 +45,18 @@ module.exports =
projectHistoryOps: ({project_id}) -> "ProjectHistory:Ops:{#{project_id}}"
projectHistoryFirstOpTimestamp: ({project_id}) -> "ProjectHistory:FirstOpTimestamp:{#{project_id}}"
new_project_history:
port: process.env["NEW_HISTORY_REDIS_PORT"] or "6379"
host: process.env["NEW_HISTORY_REDIS_HOST"]
password: process.env["NEW_HISTORY_REDIS_PASSWORD"] or ""
key_schema:
projectHistoryOps: ({project_id}) -> "ProjectHistory:Ops:{#{project_id}}"
projectHistoryFirstOpTimestamp: ({project_id}) -> "ProjectHistory:FirstOpTimestamp:{#{project_id}}"
projectHistoryMigrationKey: ({project_id}) -> "ProjectHistory:MigrationKey:{#{project_id}}"
migration_phase: process.env["PROJECT_HISTORY_MIGRATION_PHASE"] or "prepare"
redisOptions:
keepAlive: 100
lock:
port: process.env["LOCK_REDIS_PORT"] or process.env["REDIS_PORT"] or "6379"
host: process.env["LOCK_REDIS_HOST"] or process.env["REDIS_HOST"] or "localhost"

View file

@ -13,13 +13,13 @@ services:
environment:
NODE_ENV: test
test_acceptance:
build: .
image: ci/$PROJECT_NAME:$BRANCH_NAME-$BUILD_NUMBER
environment:
ELASTIC_SEARCH_DSN: es:9200
REDIS_HOST: redis
NEW_HISTORY_REDIS_HOST: new_redis
MONGO_HOST: mongo
POSTGRES_HOST: postgres
MOCHA_GREP: ${MOCHA_GREP}
@ -29,10 +29,11 @@ services:
condition: service_healthy
redis:
condition: service_healthy
new_redis:
condition: service_healthy
user: node
command: npm run test:acceptance:_run
tar:
build: .
image: ci/$PROJECT_NAME:$BRANCH_NAME-$BUILD_NUMBER
@ -43,5 +44,8 @@ services:
redis:
image: redis
new_redis:
image: redis
mongo:
image: mongo:3.6

View file

@ -25,6 +25,7 @@ services:
environment:
ELASTIC_SEARCH_DSN: es:9200
REDIS_HOST: redis
NEW_HISTORY_REDIS_HOST: new_redis
MONGO_HOST: mongo
POSTGRES_HOST: postgres
MOCHA_GREP: ${MOCHA_GREP}
@ -36,11 +37,15 @@ services:
condition: service_healthy
redis:
condition: service_healthy
new_redis:
condition: service_healthy
command: npm run test:acceptance
redis:
image: redis
new_redis:
image: redis
mongo:
image: mongo:3.6

View file

@ -0,0 +1,320 @@
sinon = require "sinon"
chai = require("chai")
chai.should()
expect = chai.expect
async = require "async"
Settings = require('settings-sharelatex')
rclient_old = require("redis-sharelatex").createClient(Settings.redis.project_history)
rclient_new = require("redis-sharelatex").createClient(Settings.redis.new_project_history)
rclient_du = require("redis-sharelatex").createClient(Settings.redis.documentupdater)
Keys = Settings.redis.documentupdater.key_schema
HistoryKeys = Settings.redis.history.key_schema
ProjectHistoryKeys = Settings.redis.project_history.key_schema
NewProjectHistoryKeys = Settings.redis.new_project_history.key_schema
MockTrackChangesApi = require "./helpers/MockTrackChangesApi"
MockWebApi = require "./helpers/MockWebApi"
DocUpdaterClient = require "./helpers/DocUpdaterClient"
DocUpdaterApp = require "./helpers/DocUpdaterApp"
# Acceptance tests for the redis migration: for every migration phase, send one
# update through docupdater and check which redis received it and what
# happened to the per-project migration flag.
describe "RedisMigrationManager", ->
  # The 'switch' and 'rollback' suites overwrite the migration phase in the
  # shared settings; remember the configured value so it can be restored.
  originalMigrationPhase = Settings.redis.new_project_history.migration_phase

  queueKey = (project_id) -> ProjectHistoryKeys.projectHistoryOps({project_id})
  migrationKey = (project_id) -> NewProjectHistoryKeys.projectHistoryMigrationKey({project_id})

  # Put a fresh project/doc pair on the suite context together with an update
  # addressed to that doc (built here so that `doc` is the real doc id).
  assignFreshIds = (ctx) ->
    [ctx.project_id, ctx.doc_id] = [DocUpdaterClient.randomId(), DocUpdaterClient.randomId()]
    ctx.update =
      doc: ctx.doc_id
      op: [{
        i: "one and a half\n"
        p: 4
      }]
      v: ctx.version

  # Register the doc with the mock web api, preload it into docupdater and
  # start spying on MockWebApi.getDocument (restored in each suite's `after`).
  loadDoc = (ctx, callback) ->
    MockWebApi.insertDoc ctx.project_id, ctx.doc_id, {lines: ctx.lines, version: ctx.version}
    DocUpdaterClient.preloadDoc ctx.project_id, ctx.doc_id, (error) ->
      return callback(error) if error?
      sinon.spy MockWebApi, "getDocument"
      callback()

  # Send the update, then wait 200ms before continuing (presumably to let
  # docupdater process it asynchronously before the assertions run).
  sendUpdateAndWait = (ctx, callback) ->
    DocUpdaterClient.sendUpdate ctx.project_id, ctx.doc_id, ctx.update, (error) ->
      return callback(error) if error?
      setTimeout callback, 200

  # Run setup steps in order and finish the hook with the first error, if any.
  runSteps = (steps, done) ->
    async.series steps, (error) -> done(error)
    return null

  # Assert that EXISTS on `key` returns `expected` (0 or 1).
  expectExists = (rclient, key, expected, done) ->
    rclient.exists key, (error, result) ->
      return done(error) if error?
      result.should.equal expected
      done()
    return null

  # Assert that the queue entry at `index` for the project carries `expectedOp`.
  expectOpAt = (rclient, project_id, index, expectedOp, done) ->
    rclient.lrange queueKey(project_id), 0, -1, (error, updates) ->
      return done(error) if error?
      JSON.parse(updates[index]).op.should.deep.equal expectedOp
      done()
    return null

  before (done) ->
    @lines = ["one", "two", "three"]
    @version = 42
    DocUpdaterApp.ensureRunning(done)

  after ->
    # do not leak the phase set by the suites below into other test files
    Settings.redis.new_project_history.migration_phase = originalMigrationPhase

  describe "when the migration phase is 'prepare' (default)", ->
    describe "when there is no migration flag", ->
      before (done) ->
        assignFreshIds(this)
        runSteps [
          (cb) => loadDoc(this, cb)
          (cb) => sendUpdateAndWait(this, cb)
        ], done

      after ->
        MockWebApi.getDocument.restore()

      it "should push the applied updates to old redis", (done) ->
        expectOpAt rclient_old, @project_id, 0, @update.op, done

      it "should not push the applied updates to the new redis", (done) ->
        expectExists rclient_new, queueKey(@project_id), 0, done

      it "should not set the migration flag for the project", (done) ->
        expectExists rclient_new, migrationKey(@project_id), 0, done

    describe "when the migration flag is set for the project", ->
      before (done) ->
        assignFreshIds(this)
        runSteps [
          (cb) => rclient_new.set migrationKey(@project_id), '1', cb
          (cb) => loadDoc(this, cb)
          (cb) => sendUpdateAndWait(this, cb)
        ], done

      after (done) ->
        MockWebApi.getDocument.restore()
        rclient_new.del migrationKey(@project_id), done
        return null

      it "should push the applied updates to the new redis", (done) ->
        expectOpAt rclient_new, @project_id, 0, @update.op, done

      it "should not push the applied updates to the old redis", (done) ->
        expectExists rclient_old, queueKey(@project_id), 0, done

      it "should keep the migration flag for the project", (done) ->
        expectExists rclient_new, migrationKey(@project_id), 1, done

  describe "when the migration phase is 'switch'", ->
    before ->
      Settings.redis.new_project_history.migration_phase = 'switch'

    describe "when the old queue is empty", ->
      before (done) ->
        assignFreshIds(this)
        runSteps [
          (cb) => loadDoc(this, cb)
          (cb) => sendUpdateAndWait(this, cb)
        ], done

      after ->
        MockWebApi.getDocument.restore()

      it "should push the applied updates to the new redis", (done) ->
        expectOpAt rclient_new, @project_id, 0, @update.op, done

      it "should not push the applied updates to the old redis", (done) ->
        expectExists rclient_old, queueKey(@project_id), 0, done

      it "should set the migration flag for the project", (done) ->
        rclient_new.get migrationKey(@project_id), (error, result) =>
          return done(error) if error?
          result.should.equal "NEW"
          done()
        return null

    describe "when the old queue is not empty", ->
      before (done) ->
        assignFreshIds(this)
        runSteps [
          (cb) => loadDoc(this, cb)
          # seed the old queue so that it is non-empty when the update arrives
          (cb) => rclient_old.rpush queueKey(@project_id), JSON.stringify({op: "dummy-op"}), cb
          (cb) => sendUpdateAndWait(this, cb)
        ], done

      after ->
        MockWebApi.getDocument.restore()

      it "should push the applied updates to the old redis", (done) ->
        rclient_old.lrange queueKey(@project_id), 0, -1, (error, updates) =>
          return done(error) if error?
          JSON.parse(updates[0]).op.should.deep.equal "dummy-op"
          JSON.parse(updates[1]).op.should.deep.equal @update.op
          done()
        return null

      it "should not push the applied updates to the new redis", (done) ->
        expectExists rclient_new, queueKey(@project_id), 0, done

      it "should not set the migration flag for the project", (done) ->
        expectExists rclient_new, migrationKey(@project_id), 0, done

    describe "when the migration flag is set for the project", ->
      before (done) ->
        assignFreshIds(this)
        runSteps [
          (cb) => rclient_new.set migrationKey(@project_id), '1', cb
          (cb) => loadDoc(this, cb)
          (cb) => sendUpdateAndWait(this, cb)
        ], done

      after (done) ->
        MockWebApi.getDocument.restore()
        rclient_new.del migrationKey(@project_id), done
        return null

      it "should push the applied updates to the new redis", (done) ->
        expectOpAt rclient_new, @project_id, 0, @update.op, done

      it "should not push the applied updates to the old redis", (done) ->
        expectExists rclient_old, queueKey(@project_id), 0, done

      it "should keep the migration flag for the project", (done) ->
        expectExists rclient_new, migrationKey(@project_id), 1, done

  describe "when the migration phase is 'rollback'", ->
    before ->
      Settings.redis.new_project_history.migration_phase = 'rollback'

    describe "when the old queue is empty", ->
      before (done) ->
        assignFreshIds(this)
        runSteps [
          (cb) => loadDoc(this, cb)
          (cb) => sendUpdateAndWait(this, cb)
        ], done

      after ->
        MockWebApi.getDocument.restore()

      it "should push the applied updates to the old redis", (done) ->
        expectOpAt rclient_old, @project_id, 0, @update.op, done

      it "should not push the applied updates to the new redis", (done) ->
        expectExists rclient_new, queueKey(@project_id), 0, done

    describe "when the new queue is not empty", ->
      before (done) ->
        assignFreshIds(this)
        runSteps [
          (cb) => loadDoc(this, cb)
          # seed the new queue; no migration flag is set for this project
          (cb) => rclient_new.rpush queueKey(@project_id), JSON.stringify({op: "dummy-op"}), cb
          (cb) => sendUpdateAndWait(this, cb)
        ], done

      after ->
        MockWebApi.getDocument.restore()

      it "should push the applied updates to the old redis", (done) ->
        expectOpAt rclient_old, @project_id, 0, @update.op, done

      it "should not push the applied updates to the new redis", (done) ->
        rclient_new.lrange queueKey(@project_id), 0, -1, (error, updates) =>
          return done(error) if error?
          JSON.parse(updates[0]).op.should.deep.equal "dummy-op"
          updates.length.should.equal 1
          done()
        return null

    describe "when the migration flag is set for the project", ->
      before (done) ->
        assignFreshIds(this)
        runSteps [
          (cb) => rclient_new.set migrationKey(@project_id), '1', cb
          (cb) => loadDoc(this, cb)
          (cb) => sendUpdateAndWait(this, cb)
        ], done

      after (done) ->
        MockWebApi.getDocument.restore()
        rclient_new.del migrationKey(@project_id), done
        return null

      it "should push the applied updates to the old redis", (done) ->
        expectOpAt rclient_old, @project_id, 0, @update.op, done

      it "should not push the applied updates to the new redis", (done) ->
        expectExists rclient_new, queueKey(@project_id), 0, done

      it "should delete the migration flag for the project", (done) ->
        expectExists rclient_new, migrationKey(@project_id), 0, done

View file

@ -13,8 +13,8 @@ module.exports =
else
@initing = true
@callbacks.push callback
app.listen 3003, "localhost", (error) =>
app.listen 3003, "localhost", (error) =>
throw error if error?
@running = true
for callback in @callbacks
callback()
callback()

View file

@ -24,6 +24,8 @@ describe "ProjectHistoryRedisManager", ->
}
"redis-sharelatex":
createClient: () => @rclient
"./RedisMigrationManager":
createClient: () => @rclient
"logger-sharelatex":
log:->
"./Metrics": @metrics = { summary: sinon.stub()}