/**
 * encodeURI, but able to handle lone (unpaired) surrogates, which encodeURI
 * itself rejects with a "URI malformed" error. Lone surrogates can legitimately
 * appear in JavaScript strings, and dmp's context-slicing can split a surrogate
 * pair across a diff/patch segment boundary, so the serialization code must
 * tolerate them. Lone surrogates are encoded as their WTF-8 byte sequence
 * (ED A0-BF 80-BF) in percent notation; decodeURISurrogateSafe reverses this.
 * For any string without lone surrogates the output is identical to encodeURI.
 *
 * Used in place of encodeURI by diff_toDelta and patch_obj.prototype.toString.
 * @param {string} str The string to encode.
 * @return {string} The encoded string.
 * @private
 */
function encodeURISurrogateSafe(str) {
  var out = '';
  // Index of the first code unit not yet flushed to `out`.
  var runStart = 0;
  for (var i = 0; i < str.length; i++) {
    var code = str.charCodeAt(i);
    if (code < 0xD800 || code > 0xDFFF) {
      continue;  // Not a surrogate: stays in the current well-formed run.
    }
    if (code <= 0xDBFF) {
      var next = str.charCodeAt(i + 1);  // NaN at end of string => unpaired.
      if (next >= 0xDC00 && next <= 0xDFFF) {
        i++;  // Proper high+low pair: keep both halves in the current run.
        continue;
      }
    }
    // `code` is a lone surrogate. encodeURI works code point by code point, so
    // encoding the well-formed run before it in a single call yields the same
    // text as encoding that run one character at a time.
    out += encodeURI(str.substring(runStart, i)) +
        '%' + (0xE0 | (code >> 12)).toString(16).toUpperCase() +
        '%' + (0x80 | ((code >> 6) & 0x3F)).toString(16).toUpperCase() +
        '%' + (0x80 | (code & 0x3F)).toString(16).toUpperCase();
    runStart = i + 1;
  }
  if (runStart === 0) {
    // No lone surrogate was found: defer entirely to encodeURI.
    return encodeURI(str);
  }
  return out + encodeURI(str.substring(runStart));
}

/**
 * decodeURI, but able to handle the WTF-8 surrogate-range percent sequences
 * emitted by encodeURISurrogateSafe for lone surrogates. The surrogate range
 * U+D800..U+DFFF is never produced by decodeURI from valid UTF-8 (decodeURI
 * rejects such bytes as malformed), so recognizing these sequences here cannot
 * collide with any legitimate decodeURI output. For any string not containing
 * such sequences the output is identical to decodeURI.
 *
 * Used in place of decodeURI by diff_fromDelta and patch_fromText, both of
 * which catch the error thrown here and rethrow it as an 'Illegal escape'
 * Error.
 * @param {string} str The string to decode.
 * @return {string} The decoded string.
 * @throws {URIError} If decodeURI rejects the remaining escape sequences.
 * @private
 */
function decodeURISurrogateSafe(str) {
  // Turn each %ED %A0-BF %80-BF triple back into its raw UTF-16 code unit
  // first; decodeURI copies non-'%' characters through untouched.
  var restored = str.replace(
      /%ED%([AB][0-9A-F])%([89AB][0-9A-F])/gi,
      function(seq, hex1, hex2) {
        var b1 = parseInt(hex1, 16);
        var b2 = parseInt(hex2, 16);
        // The lead byte is always 0xED, whose low nibble supplies the top
        // four bits (0xD000) of the surrogate.
        return String.fromCharCode(0xD000 | ((b1 & 0x3F) << 6) | (b2 & 0x3F));
      });
  return decodeURI(restored);
}
// NOTE(review): in patch_fromText the catch block builds its message as
// 'Illegal escape in patch_fromText: ' + line, but `line` is not assigned when
// decodeURISurrogateSafe throws, so the message cannot show the offending
// input. Confirm and report text[textPointer] instead.

/**
 * Regression test for lone (unpaired) surrogates in serialized patches and
 * deltas (issue #22). Listed in the `tests` array between 'testPatchToText'
 * and 'testPatchAddContext'.
 */
function testLoneSurrogate() {
  // Both inputs are valid Unicode, but context-slicing can leave half of an
  // emoji's surrogate pair inside a patch segment. Plain encodeURI throws
  // URIError on that half, so serializing must survive it and the patch must
  // still round-trip to the target text.
  var before = 'ab😀😀';
  var after = 'b😀😀';

  var serialized = dmp.patch_toText(dmp.patch_make(before, after));
  assertEquals('string', typeof serialized);

  var applied = dmp.patch_apply(dmp.patch_fromText(serialized), before);
  var patchedText = applied[0];
  var hunkStatus = applied[1];
  assertEquals(after, patchedText);
  assertTrue(hunkStatus[0]);

  // Same minimal repro again, checking only that serialization does not throw.
  var repro = dmp.patch_make(before, after);
  assertEquals('string', typeof dmp.patch_toText(repro));

  // A lone high surrogate in an insertion must survive diff_toDelta and come
  // back unchanged through diff_fromDelta.
  var loneDiffs = [[DIFF_INSERT, 'a\uD83D'], [DIFF_EQUAL, 'b']];
  var wire = dmp.diff_toDelta(loneDiffs);
  var baseText = dmp.diff_text1(loneDiffs);
  assertEquivalent(loneDiffs, dmp.diff_fromDelta(baseText, wire));
  assertEquals('a\uD83Db', dmp.diff_text2(loneDiffs));
}