utf8.js 6.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202
  1. /*! https://mths.be/utf8js v3.0.0 by @mathias */
  2. ; (function (root) {
  // Local alias for String.fromCharCode. The helpers in this file that build
  // output strings (ucs2encode, createByte, encodeCodePoint) call it through
  // this name.
  3. var stringFromCharCode = String.fromCharCode;
  4. // Taken from https://mths.be/punycode
  /**
   * Splits a JavaScript string into an array of Unicode code points.
   *
   * A high surrogate immediately followed by a low surrogate is merged into
   * the single supplementary code point the pair represents. Any other code
   * unit, including an unpaired surrogate, is emitted unchanged.
   *
   * @param {string} string The string to split.
   * @returns {number[]} The code points of `string`, in order.
   */
  function ucs2decode(string) {
    var codePoints = [];
    var total = string.length;
    var position = 0;
    while (position < total) {
      var unit = string.charCodeAt(position);
      position += 1;
      var isHighSurrogate = unit >= 0xD800 && unit <= 0xDBFF;
      if (isHighSurrogate && position < total) {
        var following = string.charCodeAt(position);
        if ((following & 0xFC00) === 0xDC00) {
          // Proper pair: consume the low surrogate too and emit one value.
          position += 1;
          codePoints.push(((unit & 0x3FF) << 10) + (following & 0x3FF) + 0x10000);
          continue;
        }
        // Not a low surrogate. `position` is left pointing at `following` so
        // that it is examined on its own in the next iteration (it may be the
        // high half of a valid pair).
      }
      codePoints.push(unit);
    }
    return codePoints;
  }
  30. // Taken from https://mths.be/punycode
  /**
   * Builds a JavaScript string from an array of Unicode code points.
   *
   * Values above 0xFFFF are written as a high/low surrogate pair; every other
   * value is written as a single code unit.
   *
   * @param {number[]} array The code points to join.
   * @returns {string} The resulting string.
   */
  function ucs2encode(array) {
    var result = '';
    for (var i = 0, count = array.length; i < count; i++) {
      var codePoint = array[i];
      if (codePoint > 0xFFFF) {
        // Supplementary plane: split the 20-bit offset into two 10-bit halves.
        var offset = codePoint - 0x10000;
        var highSurrogate = ((offset >>> 10) & 0x3FF) | 0xD800;
        var lowSurrogate = 0xDC00 | (offset & 0x3FF);
        result += stringFromCharCode(highSurrogate);
        result += stringFromCharCode(lowSurrogate);
      } else {
        result += stringFromCharCode(codePoint);
      }
    }
    return result;
  }
  /**
   * Rejects code points that fall in the surrogate range U+D800..U+DFFF.
   *
   * @param {number} codePoint The code point to validate.
   * @returns {undefined} Nothing; returns normally when the value is accepted.
   * @throws {Error} When `codePoint` lies in the surrogate range.
   */
  function checkScalarValue(codePoint) {
    var isSurrogate = codePoint >= 0xD800 && codePoint <= 0xDFFF;
    if (!isSurrogate) {
      return;
    }
    var hex = codePoint.toString(16).toUpperCase();
    throw new Error('Lone surrogate U+' + hex + ' is not a scalar value');
  }
  55. /*--------------------------------------------------------------------------*/
  /**
   * Produces one UTF-8 continuation byte (bit pattern 10xxxxxx) as a
   * one-character string.
   *
   * @param {number} codePoint The code point being encoded.
   * @param {number} shift How many bits to shift right before taking the low
   *     six bits as the payload.
   * @returns {string} A single character whose code is 0x80 | payload.
   */
  function createByte(codePoint, shift) {
    var payload = (codePoint >> shift) & 0x3F;
    return stringFromCharCode(0x80 | payload);
  }
  /**
   * Encodes one code point as a string of one to four characters, each
   * character standing for one UTF-8 byte.
   *
   * Code points in the 3-byte range are passed through checkScalarValue
   * first. A value that fits none of the four length classes produces only
   * the final continuation byte.
   *
   * @param {number} codePoint The code point to encode.
   * @returns {string} The byte string for `codePoint`.
   * @throws {Error} Whatever checkScalarValue throws for a 3-byte-range value.
   */
  function encodeCodePoint(codePoint) {
    // 1-byte sequence: the value is its own encoding.
    if ((codePoint & 0xFFFFFF80) === 0) {
      return stringFromCharCode(codePoint);
    }

    // Every multi-byte form ends with the low six bits as a continuation byte.
    var trailing = createByte(codePoint, 0);

    // 2-byte sequence
    if ((codePoint & 0xFFFFF800) === 0) {
      return stringFromCharCode(((codePoint >> 6) & 0x1F) | 0xC0) + trailing;
    }

    // 3-byte sequence
    if ((codePoint & 0xFFFF0000) === 0) {
      checkScalarValue(codePoint);
      return stringFromCharCode(((codePoint >> 12) & 0x0F) | 0xE0) +
        createByte(codePoint, 6) +
        trailing;
    }

    // 4-byte sequence
    if ((codePoint & 0xFFE00000) === 0) {
      return stringFromCharCode(((codePoint >> 18) & 0x07) | 0xF0) +
        createByte(codePoint, 12) +
        createByte(codePoint, 6) +
        trailing;
    }

    // None of the ranges matched: only the trailing byte is emitted.
    return trailing;
  }
  /**
   * Encodes a JavaScript string as a UTF-8 byte string, i.e. a string in
   * which each character stands for one byte.
   *
   * The input is split with ucs2decode and each resulting code point is run
   * through encodeCodePoint; the pieces are concatenated in order.
   *
   * @param {string} string The text to encode.
   * @returns {string} The concatenated encodings of all code points.
   * @throws {Error} Whatever encodeCodePoint throws for a rejected code point.
   */
  function utf8encode(string) {
    var codePoints = ucs2decode(string);
    var encoded = '';
    for (var i = 0, count = codePoints.length; i < count; i++) {
      encoded += encodeCodePoint(codePoints[i]);
    }
    return encoded;
  }
  92. /*--------------------------------------------------------------------------*/
  /**
   * Consumes the next entry of the shared decoder input (byteArray at
   * byteIndex) and returns its six payload bits.
   *
   * The entry is masked to eight bits, and byteIndex is advanced before the
   * entry is validated, so a rejected byte is still consumed.
   *
   * @returns {number} The low six bits of the continuation byte.
   * @throws {Error} 'Invalid byte index' when no input is left, or
   *     'Invalid continuation byte' when the byte is not of the form 10xxxxxx.
   */
  function readContinuationByte() {
    if (byteIndex >= byteCount) {
      throw new Error('Invalid byte index');
    }
    var octet = byteArray[byteIndex] & 0xFF;
    byteIndex += 1;
    if ((octet & 0xC0) !== 0x80) {
      // The top two bits are not `10`, so this is not a continuation byte.
      throw new Error('Invalid continuation byte');
    }
    return octet & 0x3F;
  }
  /**
   * Decodes the next UTF-8 sequence from the shared decoder input and
   * advances byteIndex past it.
   *
   * The lead byte is masked to eight bits and selects a 1-, 2-, 3- or 4-byte
   * form. Continuation bytes are fetched with readContinuationByte. Results
   * below the minimum for the 2- and 3-byte forms are rejected, 3-byte
   * results are passed to checkScalarValue, and 4-byte results must lie in
   * 0x010000..0x10FFFF.
   *
   * @returns {number|boolean} The decoded code point, or `false` when
   *     byteIndex equals byteCount (input exhausted).
   * @throws {Error} 'Invalid byte index', 'Invalid continuation byte' or
   *     'Invalid UTF-8 detected', plus anything thrown by
   *     readContinuationByte or checkScalarValue.
   */
  function decodeSymbol() {
    if (byteIndex > byteCount) {
      throw new Error('Invalid byte index');
    }
    if (byteIndex === byteCount) {
      return false;
    }

    var lead = byteArray[byteIndex] & 0xFF;
    byteIndex += 1;

    var second;
    var third;
    var fourth;
    var codePoint;

    // 0xxxxxxx: single byte, no continuation bytes.
    if ((lead & 0x80) === 0) {
      return lead;
    }

    // 110xxxxx: two bytes.
    if ((lead & 0xE0) === 0xC0) {
      second = readContinuationByte();
      codePoint = ((lead & 0x1F) << 6) | second;
      if (codePoint < 0x80) {
        // Value would have fit in one byte.
        throw new Error('Invalid continuation byte');
      }
      return codePoint;
    }

    // 1110xxxx: three bytes (range includes the surrogates, checked below).
    if ((lead & 0xF0) === 0xE0) {
      second = readContinuationByte();
      third = readContinuationByte();
      codePoint = ((lead & 0x0F) << 12) | (second << 6) | third;
      if (codePoint < 0x0800) {
        // Value would have fit in two bytes or fewer.
        throw new Error('Invalid continuation byte');
      }
      checkScalarValue(codePoint);
      return codePoint;
    }

    // 11110xxx: four bytes.
    if ((lead & 0xF8) === 0xF0) {
      second = readContinuationByte();
      third = readContinuationByte();
      fourth = readContinuationByte();
      codePoint = ((lead & 0x07) << 0x12) | (second << 0x0C) |
        (third << 0x06) | fourth;
      if (codePoint >= 0x010000 && codePoint <= 0x10FFFF) {
        return codePoint;
      }
      // Out-of-range 4-byte values fall through to the generic error.
    }

    throw new Error('Invalid UTF-8 detected');
  }
  // Decoder state. utf8decode resets all three on every call; decodeSymbol
  // and readContinuationByte read entries and advance the index.
  var byteArray; // values produced by ucs2decode for the current input
  var byteCount; // number of entries in byteArray
  var byteIndex; // index of the next entry to consume

  /**
   * Decodes a UTF-8 byte string (one character per byte) into a regular
   * JavaScript string.
   *
   * The input is loaded into the shared decoder state, decodeSymbol is called
   * until it returns `false`, and the collected code points are joined with
   * ucs2encode.
   *
   * @param {string} byteString The byte string to decode.
   * @returns {string} The decoded text.
   * @throws {Error} Whatever decodeSymbol throws for malformed input.
   */
  function utf8decode(byteString) {
    byteArray = ucs2decode(byteString);
    byteCount = byteArray.length;
    byteIndex = 0;

    var decoded = [];
    // Compare against `false` explicitly: a decoded code point of 0 is valid
    // and must not end the loop.
    for (var symbol = decodeSymbol(); symbol !== false; symbol = decodeSymbol()) {
      decoded.push(symbol);
    }
    return ucs2encode(decoded);
  }
  173. /*--------------------------------------------------------------------------*/
  // Public API: attach the library version string and the two entry points
  // to `root`, the object the enclosing function receives as its argument.
  174. root.version = '3.0.0';
  175. root.encode = utf8encode;
  176. root.decode = utf8decode;
  177. }(typeof exports === 'undefined' ? this.utf8 = {} : exports));