From a3847d21d5631209ae01abaeeca7106292494927 Mon Sep 17 00:00:00 2001 From: James Allen Date: Fri, 12 Jun 2015 10:14:35 +0100 Subject: [PATCH 1/2] Replace UTF-16 surrogate characters with 'replacement character' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In Javascript, characters are 16-bits wide. It does not understand surrogates as characters. From Wikipedia (http://en.wikipedia.org/wiki/Plane_(Unicode)#Basic_Multilingual_Plane): "The High Surrogates (U+D800–U+DBFF) and Low Surrogate (U+DC00–U+DFFF) codes are reserved for encoding non-BMP characters in UTF-16 by using a pair of 16-bit codes: one High Surrogate and one Low Surrogate. A single surrogate code point will never be assigned a character."" The main offender seems to be \uD835 as a stand alone character, which would be the first 16-bit character of a blackboard bold character (http://www.fileformat.info/info/unicode/char/1d400/index.htm). Something must be going on client side that is screwing up the encoding and splitting the two 16-bit characters so that \uD835 is standalone. --- .../app/coffee/UpdateManager.coffee | 21 ++++++++++- .../UpdateManager/ApplyingUpdates.coffee | 36 ++++++++++++++----- 2 files changed, 48 insertions(+), 9 deletions(-) diff --git a/services/document-updater/app/coffee/UpdateManager.coffee b/services/document-updater/app/coffee/UpdateManager.coffee index a1db456457..feff628453 100644 --- a/services/document-updater/app/coffee/UpdateManager.coffee +++ b/services/document-updater/app/coffee/UpdateManager.coffee @@ -54,6 +54,8 @@ module.exports = UpdateManager = UpdateManager.applyUpdates project_id, doc_id, updates, callback applyUpdates: (project_id, doc_id, updates, callback = (error) ->) -> + for update in updates + UpdateManager._sanitizeUpdate update ShareJsUpdateManager.applyUpdates project_id, doc_id, updates, (error, updatedDocLines, version) -> return callback(error) if error? logger.log doc_id: doc_id, version: version, "updating doc via sharejs" @@ -75,5 +77,22 @@ module.exports = UpdateManager = _handleErrorInsideLock: (doc_id, original_error, callback = (error) ->) -> LockManager.releaseLock doc_id, (lock_error) -> callback(original_error) + + _sanitizeUpdate: (update) -> + # In Javascript, characters are 16-bits wide. It does not understand surrogates as characters. + # + # From Wikipedia (http://en.wikipedia.org/wiki/Plane_(Unicode)#Basic_Multilingual_Plane): + # "The High Surrogates (U+D800–U+DBFF) and Low Surrogate (U+DC00–U+DFFF) codes are reserved + # for encoding non-BMP characters in UTF-16 by using a pair of 16-bit codes: one High Surrogate + # and one Low Surrogate. A single surrogate code point will never be assigned a character."" + # + # The main offender seems to be \uD835 as a stand alone character, which would be the first + # 16-bit character of a blackboard bold character (http://www.fileformat.info/info/unicode/char/1d400/index.htm). + # Something must be going on client side that is screwing up the encoding and splitting the + # two 16-bit characters so that \uD835 is standalone. + for op in update.op or [] + if op.i? + # Replace high and low surrogate characters with 'replacement character' (\uFFFD) + op.i = op.i.replace(/[\uD800-\uDFFF]/g, "\uFFFD") + return update - diff --git a/services/document-updater/test/unit/coffee/UpdateManager/ApplyingUpdates.coffee b/services/document-updater/test/unit/coffee/UpdateManager/ApplyingUpdates.coffee index f421d545a7..70bab70c71 100644 --- a/services/document-updater/test/unit/coffee/UpdateManager/ApplyingUpdates.coffee +++ b/services/document-updater/test/unit/coffee/UpdateManager/ApplyingUpdates.coffee @@ -180,19 +180,39 @@ describe "UpdateManager", -> describe "applyUpdates", -> beforeEach -> - @updates = [{p: 1, t: "foo"}] + @updates = [{op: [{p: 42, i: "foo"}]}] @updatedDocLines = ["updated", "lines"] @version = 34 @ShareJsUpdateManager.applyUpdates = sinon.stub().callsArgWith(3, null, @updatedDocLines, @version) @RedisManager.setDocument = sinon.stub().callsArg(3) - @UpdateManager.applyUpdates @project_id, @doc_id, @updates, @callback + + describe "normally", -> + beforeEach -> + @UpdateManager.applyUpdates @project_id, @doc_id, @updates, @callback + + it "should apply the updates via ShareJS", -> + @ShareJsUpdateManager.applyUpdates + .calledWith(@project_id, @doc_id, @updates) + .should.equal true - it "should save the document", -> - @RedisManager.setDocument - .calledWith(@doc_id, @updatedDocLines, @version) - .should.equal true + it "should save the document", -> + @RedisManager.setDocument + .calledWith(@doc_id, @updatedDocLines, @version) + .should.equal true - it "should call the callback", -> - @callback.called.should.equal true + it "should call the callback", -> + @callback.called.should.equal true + describe "with UTF-16 surrogate pairs in the update", -> + beforeEach -> + @updates = [{op: [{p: 42, i: "\uD835\uDC00"}]}] + @UpdateManager.applyUpdates @project_id, @doc_id, @updates, @callback + + it "should apply the update but with surrogate pairs removed", -> + @ShareJsUpdateManager.applyUpdates + .calledWith(@project_id, @doc_id, @updates) + .should.equal true + + # \uFFFD is 'replacement character' + @updates[0].op[0].i.should.equal "\uFFFD\uFFFD" From 333591d0877f7feda21795e5ffe1eca70d72e8d6 Mon Sep 17 00:00:00 2001 From: James Allen Date: Fri, 12 Jun 2015 10:16:33 +0100 Subject: [PATCH 2/2] Extra null check --- services/document-updater/app/coffee/UpdateManager.coffee | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/document-updater/app/coffee/UpdateManager.coffee b/services/document-updater/app/coffee/UpdateManager.coffee index feff628453..8765c330ff 100644 --- a/services/document-updater/app/coffee/UpdateManager.coffee +++ b/services/document-updater/app/coffee/UpdateManager.coffee @@ -54,7 +54,7 @@ module.exports = UpdateManager = UpdateManager.applyUpdates project_id, doc_id, updates, callback applyUpdates: (project_id, doc_id, updates, callback = (error) ->) -> - for update in updates + for update in updates or [] UpdateManager._sanitizeUpdate update ShareJsUpdateManager.applyUpdates project_id, doc_id, updates, (error, updatedDocLines, version) -> return callback(error) if error?