Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 84 additions & 4 deletions index.js
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,86 @@
* @author fraser@google.com (Neil Fraser)
*/

/**
 * Percent-encodes a string exactly like encodeURI, except that lone
 * (unpaired) UTF-16 surrogates are accepted instead of raising the
 * "URI malformed" URIError that encodeURI throws for them. Lone surrogates can
 * legitimately appear in JavaScript strings, and dmp's context-slicing can
 * split a surrogate pair across a diff/patch segment boundary, so the
 * serialization code must tolerate them.
 *
 * Each lone surrogate is emitted as its three-byte WTF-8 sequence
 * (ED A0-BF 80-BF) in upper-case percent notation; decodeURISurrogateSafe
 * reverses this. Everything else (including well-formed surrogate pairs) is
 * passed through encodeURI, so for a string without lone surrogates the
 * result is identical to encodeURI(str).
 * @param {string} str The string to encode.
 * @return {string} The encoded string.
 * @private
 */
function encodeURISurrogateSafe(str) {
  // Matches a high surrogate that is not followed by a low surrogate, or a
  // low surrogate that is not preceded by a high surrogate.
  var loneSurrogate =
      /[\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?:[^\uD800-\uDBFF]|^)[\uDC00-\uDFFF]/;
  if (!loneSurrogate.test(str)) {
    // Fast path: well-formed input needs nothing beyond the native encoder.
    return encodeURI(str);
  }

  // Tokenize into (a) well-formed surrogate pairs, (b) single lone surrogates
  // and (c) runs of non-surrogate code units. The alternatives are tried in
  // that order, so a surrogate that reaches (b) is guaranteed to be unpaired.
  // encodeURI works code point by code point, so encoding the (a) and (c)
  // chunks separately yields the same text as encoding them together.
  return str.replace(
      /[\uD800-\uDBFF][\uDC00-\uDFFF]|[\uD800-\uDFFF]|[^\uD800-\uDFFF]+/g,
      function(chunk) {
        var code = chunk.charCodeAt(0);
        var isLoneSurrogate =
            chunk.length === 1 && code >= 0xD800 && code <= 0xDFFF;
        if (!isLoneSurrogate) {
          return encodeURI(chunk);
        }
        // Generic three-byte UTF-8 layout applied to the surrogate code unit.
        // Every byte is >= 0x80, so toString(16) always yields two digits.
        var lead = 0xE0 | (code >> 12);
        var middle = 0x80 | ((code >> 6) & 0x3F);
        var tail = 0x80 | (code & 0x3F);
        return '%' + lead.toString(16).toUpperCase() +
            '%' + middle.toString(16).toUpperCase() +
            '%' + tail.toString(16).toUpperCase();
      });
}

/**
 * Decodes a string exactly like decodeURI, except that the WTF-8
 * surrogate-range percent sequences (%ED %A0-BF %80-BF, matched
 * case-insensitively) emitted by encodeURISurrogateSafe for lone surrogates
 * are turned back into the corresponding UTF-16 code unit instead of being
 * rejected. decodeURI treats those byte sequences as malformed, so accepting
 * them here cannot collide with any legitimate decodeURI output. For a string
 * that contains no such sequence the result is identical to decodeURI(str).
 * @param {string} str The string to decode.
 * @return {string} The decoded string.
 * @throws {URIError} If decodeURI rejects what remains after the surrogate
 *     sequences have been restored.
 * @private
 */
function decodeURISurrogateSafe(str) {
  // Restore the surrogates first: decodeURI copies characters that are not
  // part of a percent escape through unchanged, so the restored code units
  // survive the native decoding pass below.
  var restored = str.replace(
      /%ED%([AB][0-9A-F])%([89AB][0-9A-F])/gi,
      function(sequence, middleHex, tailHex) {
        var middle = parseInt(middleHex, 16);
        var tail = parseInt(tailHex, 16);
        // The lead byte is always 0xED, contributing (0xED & 0x0F) << 12.
        var code = 0xD000 | ((middle & 0x3F) << 6) | (tail & 0x3F);
        return String.fromCharCode(code);
      });
  return decodeURI(restored);
}

/**
* Class containing the diff, match and patch methods.
* @constructor
Expand Down Expand Up @@ -1343,7 +1423,7 @@ diff_match_patch.prototype.diff_toDelta = function(diffs) {
for (var x = 0; x < diffs.length; x++) {
switch (diffs[x][0]) {
case DIFF_INSERT:
text[x] = '+' + encodeURI(diffs[x][1]);
text[x] = '+' + encodeURISurrogateSafe(diffs[x][1]);
break;
case DIFF_DELETE:
text[x] = '-' + diffs[x][1].length;
Expand Down Expand Up @@ -1378,7 +1458,7 @@ diff_match_patch.prototype.diff_fromDelta = function(text1, delta) {
case '+':
try {
diffs[diffsLength++] =
new diff_match_patch.Diff(DIFF_INSERT, decodeURI(param));
new diff_match_patch.Diff(DIFF_INSERT, decodeURISurrogateSafe(param));
} catch (ex) {
// Malformed URI sequence.
throw new Error('Illegal escape in diff_fromDelta: ' + param);
Expand Down Expand Up @@ -2119,7 +2199,7 @@ diff_match_patch.prototype.patch_fromText = function(textline) {
while (textPointer < text.length) {
var sign = text[textPointer].charAt(0);
try {
var line = decodeURI(text[textPointer].substring(1));
var line = decodeURISurrogateSafe(text[textPointer].substring(1));
} catch (ex) {
// Malformed URI sequence.
throw new Error('Illegal escape in patch_fromText: ' + line);
Expand Down Expand Up @@ -2204,7 +2284,7 @@ diff_match_patch.patch_obj.prototype.toString = function() {
op = ' ';
break;
}
text[x + 1] = op + encodeURI(this.diffs[x][1]) + '\n';
text[x + 1] = op + encodeURISurrogateSafe(this.diffs[x][1]) + '\n';
}
return text.join('').replace(/%20/g, ' ');
};
Expand Down
27 changes: 27 additions & 0 deletions test/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -780,6 +780,32 @@ function testPatchToText() {
assertEquals(strp, dmp.patch_toText(p));
}

function testLoneSurrogate() {
  // Issue #22: both inputs below are fully valid Unicode (emoji), yet
  // context-slicing can leave half of a surrogate pair inside a patch segment.
  // encodeURI would throw a URIError on that lone half, so this checks that
  // serializing does not throw and that the patch round-trips to the target.
  var before = 'ab😀😀';
  var after = 'b😀😀';
  var serialized = dmp.patch_toText(dmp.patch_make(before, after));
  assertEquals('string', typeof serialized);
  var parsedPatches = dmp.patch_fromText(serialized);
  var applied = dmp.patch_apply(parsedPatches, before);
  assertEquals(after, applied[0]);
  assertTrue(applied[1][0]);

  // Same minimal repro again, exercising only the patch toString
  // serialization path.
  var serializedAgain = dmp.patch_toText(dmp.patch_make(before, after));
  assertEquals('string', typeof serializedAgain);

  // A diff holding a lone surrogate must serialize through diff_toDelta and
  // come back unchanged through diff_fromDelta.
  var loneDiffs = [[DIFF_INSERT, 'a\uD83D'], [DIFF_EQUAL, 'b']];
  var encodedDelta = dmp.diff_toDelta(loneDiffs);
  var sourceText = dmp.diff_text1(loneDiffs);
  assertEquivalent(loneDiffs, dmp.diff_fromDelta(sourceText, encodedDelta));
  assertEquals('a\uD83Db', dmp.diff_text2(loneDiffs));
}

function testPatchAddContext() {
dmp.Patch_Margin = 4;
var p = dmp.patch_fromText('@@ -21,4 +21,10 @@\n-jump\n+somersault\n')[0];
Expand Down Expand Up @@ -1038,6 +1064,7 @@ var tests = [
'testPatchObj',
'testPatchFromText',
'testPatchToText',
'testLoneSurrogate',
'testPatchAddContext',
'testPatchMake',
'testPatchSplitMax',
Expand Down