Replace UTF-16 surrogate characters with 'replacement character'

In JavaScript, characters are 16 bits wide. It does not understand surrogates as characters.

From Wikipedia (http://en.wikipedia.org/wiki/Plane_(Unicode)#Basic_Multilingual_Plane):
"The High Surrogates (U+D800–U+DBFF) and Low Surrogate (U+DC00–U+DFFF) codes are reserved
for encoding non-BMP characters in UTF-16 by using a pair of 16-bit codes: one High Surrogate
and one Low Surrogate. A single surrogate code point will never be assigned a character."

The main offender seems to be \uD835 as a standalone character, which would be the first
16-bit character of a blackboard bold character (http://www.fileformat.info/info/unicode/char/1d400/index.htm).
Something must be going on client side that is screwing up the encoding and splitting the
two 16-bit characters so that \uD835 is standalone.
This commit is contained in:
James Allen 2015-06-12 10:14:35 +01:00
parent d12341da1d
commit a3847d21d5
2 changed files with 48 additions and 9 deletions

View file

@ -54,6 +54,8 @@ module.exports = UpdateManager =
UpdateManager.applyUpdates project_id, doc_id, updates, callback
applyUpdates: (project_id, doc_id, updates, callback = (error) ->) ->
for update in updates
UpdateManager._sanitizeUpdate update
ShareJsUpdateManager.applyUpdates project_id, doc_id, updates, (error, updatedDocLines, version) ->
return callback(error) if error?
logger.log doc_id: doc_id, version: version, "updating doc via sharejs"
@ -76,4 +78,21 @@ module.exports = UpdateManager =
LockManager.releaseLock doc_id, (lock_error) ->
callback(original_error)
_sanitizeUpdate: (update) ->
	# Replaces every UTF-16 surrogate code unit in the inserted text of each op
	# with the 'replacement character' (\uFFFD). The update is modified in place
	# and also returned. Ops without a string `i` (insert) field are left untouched,
	# as is an update with no `op` list.
	#
	# In JavaScript, characters are 16 bits wide. It does not understand surrogates as characters.
	#
	# From Wikipedia (http://en.wikipedia.org/wiki/Plane_(Unicode)#Basic_Multilingual_Plane):
	# "The High Surrogates (U+D800–U+DBFF) and Low Surrogate (U+DC00–U+DFFF) codes are reserved
	# for encoding non-BMP characters in UTF-16 by using a pair of 16-bit codes: one High Surrogate
	# and one Low Surrogate. A single surrogate code point will never be assigned a character."
	#
	# The main offender seems to be \uD835 as a standalone character, which would be the first
	# 16-bit character of a blackboard bold character (http://www.fileformat.info/info/unicode/char/1d400/index.htm).
	# Something must be going on client side that is screwing up the encoding and splitting the
	# two 16-bit characters so that \uD835 is standalone.
	#
	# Note that the character class below matches ANY surrogate code unit, so a
	# well-formed high+low pair is replaced too (by two replacement characters).
	for op in update.op or []
		# Only sanitize string inserts: calling .replace on a non-string `i`
		# would throw synchronously instead of leaving the op as it was.
		if typeof op.i is "string"
			# Replace high and low surrogate characters with 'replacement character' (\uFFFD)
			op.i = op.i.replace(/[\uD800-\uDFFF]/g, "\uFFFD")
	return update

View file

@ -180,13 +180,21 @@ describe "UpdateManager", ->
describe "applyUpdates", ->
beforeEach ->
@updates = [{p: 1, t: "foo"}]
@updates = [{op: [{p: 42, i: "foo"}]}]
@updatedDocLines = ["updated", "lines"]
@version = 34
@ShareJsUpdateManager.applyUpdates = sinon.stub().callsArgWith(3, null, @updatedDocLines, @version)
@RedisManager.setDocument = sinon.stub().callsArg(3)
describe "normally", ->
beforeEach ->
@UpdateManager.applyUpdates @project_id, @doc_id, @updates, @callback
it "should apply the updates via ShareJS", ->
@ShareJsUpdateManager.applyUpdates
.calledWith(@project_id, @doc_id, @updates)
.should.equal true
it "should save the document", ->
@RedisManager.setDocument
.calledWith(@doc_id, @updatedDocLines, @version)
@ -195,4 +203,16 @@ describe "UpdateManager", ->
it "should call the callback", ->
@callback.called.should.equal true
describe "with UTF-16 surrogate pairs in the update", ->
	beforeEach ->
		# "\uD835\uDC00" is a high surrogate followed by a low surrogate,
		# i.e. a single non-BMP character encoded as two 16-bit code units.
		@updates = [{op: [{p: 42, i: "\uD835\uDC00"}]}]
		@UpdateManager.applyUpdates @project_id, @doc_id, @updates, @callback

	it "should apply the update but with surrogate pairs replaced by the replacement character", ->
		# The updates are sanitized in place, so ShareJS receives the same
		# array object that was passed to applyUpdates.
		@ShareJsUpdateManager.applyUpdates
			.calledWith(@project_id, @doc_id, @updates)
			.should.equal true

		# \uFFFD is 'replacement character'; each surrogate code unit is
		# replaced individually, so the pair becomes two of them.
		@updates[0].op[0].i.should.equal "\uFFFD\uFFFD"