deno.land / x / msgpack_javascript@v3.0.0-beta2 / src / utils / utf8.ts
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179
/**
 * Computes how many bytes the UTF-8 encoding of `str` occupies.
 *
 * The string is walked one UTF-16 code unit at a time. A high surrogate that is
 * immediately followed by a low surrogate is counted as a single 4-byte
 * sequence; any other surrogate code unit is counted as 3 bytes.
 *
 * @param str - The string to measure.
 * @returns The number of bytes needed to hold `str` in UTF-8.
 */
export function utf8Count(str: string): number {
  const unitCount = str.length;
  let total = 0;

  for (let index = 0; index < unitCount; index++) {
    const unit = str.charCodeAt(index);

    if (unit < 0x80) {
      // U+0000..U+007F: single byte
      total += 1;
    } else if (unit < 0x800) {
      // U+0080..U+07FF: two bytes
      total += 2;
    } else if (
      unit >= 0xd800 &&
      unit <= 0xdbff &&
      index + 1 < unitCount &&
      (str.charCodeAt(index + 1) & 0xfc00) === 0xdc00
    ) {
      // A well-formed surrogate pair is one supplementary code point: four
      // bytes, and the low surrogate must not be visited again.
      index++;
      total += 4;
    } else {
      // Remaining BMP code units (including unpaired surrogates): three bytes
      total += 3;
    }
  }

  return total;
}
/**
 * Encodes `str` as UTF-8 into `output`, starting at `outputOffset`, using a
 * pure-JavaScript loop.
 *
 * Well-formed surrogate pairs are combined into one 4-byte sequence. An
 * unpaired surrogate (a high surrogate without a following low surrogate, or a
 * stray low surrogate) is encoded as U+FFFD REPLACEMENT CHARACTER, i.e. the
 * bytes `EF BF BD`. This keeps the output well-formed UTF-8 and matches what
 * `TextEncoder` produces, so the result does not depend on which encoder
 * implementation is picked. The replacement is 3 bytes long, the same size a
 * lone surrogate code unit would otherwise have occupied.
 *
 * No bounds checking is performed: the caller is responsible for making sure
 * `output` has enough room after `outputOffset`.
 *
 * @param str - The string to encode.
 * @param output - Destination buffer that receives the encoded bytes.
 * @param outputOffset - Index in `output` at which the first byte is written.
 */
export function utf8EncodeJs(str: string, output: Uint8Array, outputOffset: number): void {
  const strLength = str.length;
  let offset = outputOffset;
  let pos = 0;

  while (pos < strLength) {
    let value = str.charCodeAt(pos++);

    if ((value & 0xffffff80) === 0) {
      // 1-byte: no continuation byte follows
      output[offset++] = value;
      continue;
    } else if ((value & 0xfffff800) === 0) {
      // 2-bytes: lead byte here, the shared trailing byte is written below
      output[offset++] = ((value >> 6) & 0x1f) | 0xc0;
    } else {
      if ((value & 0xf800) === 0xd800) {
        // Surrogate code unit (0xD800..0xDFFF). Only a high surrogate may start
        // a pair, so only then is the next code unit inspected.
        const extra = value <= 0xdbff && pos < strLength ? str.charCodeAt(pos) : 0;
        if ((extra & 0xfc00) === 0xdc00) {
          ++pos;
          value = ((value & 0x3ff) << 10) + (extra & 0x3ff) + 0x10000;
        } else {
          // Unpaired surrogate: not encodable in UTF-8, substitute U+FFFD.
          value = 0xfffd;
        }
      }

      if ((value & 0xffff0000) === 0) {
        // 3-byte
        output[offset++] = ((value >> 12) & 0x0f) | 0xe0;
        output[offset++] = ((value >> 6) & 0x3f) | 0x80;
      } else {
        // 4-byte
        output[offset++] = ((value >> 18) & 0x07) | 0xf0;
        output[offset++] = ((value >> 12) & 0x3f) | 0x80;
        output[offset++] = ((value >> 6) & 0x3f) | 0x80;
      }
    }

    // Final continuation byte, common to every multi-byte sequence.
    output[offset++] = (value & 0x3f) | 0x80;
  }
}
// TextEncoder and TextDecoder are standardized in the WHATWG Encoding spec:
// https://encoding.spec.whatwg.org/
// and are available in all the modern browsers:
// https://caniuse.com/textencoder
// They are available in Node.js since v12 LTS as well:
// https://nodejs.org/api/globals.html#textencoder

/** Single module-wide encoder instance, reused by every `utf8EncodeTE` call. */
const sharedTextEncoder = new TextEncoder();

// Strings longer than this many UTF-16 code units are handed to TextEncoder.
// The value should be determined by benchmarking, which might vary between
// engines and input data.
// Run `npx ts-node benchmark/encode-string.ts` for details.
const TEXT_ENCODER_THRESHOLD = 50;

/**
 * Encodes `str` as UTF-8 into `output` using the shared `TextEncoder`.
 *
 * The encoder writes into a view of `output` that begins at `outputOffset`;
 * the value returned by `encodeInto` is discarded.
 *
 * @param str - The string to encode.
 * @param output - Destination buffer that receives the encoded bytes.
 * @param outputOffset - Index in `output` at which the first byte is written.
 */
export function utf8EncodeTE(str: string, output: Uint8Array, outputOffset: number): void {
  const destination = output.subarray(outputOffset);
  sharedTextEncoder.encodeInto(str, destination);
}
/**
 * Encodes `str` as UTF-8 into `output` at `outputOffset`, choosing the
 * implementation by string length: strings longer than
 * `TEXT_ENCODER_THRESHOLD` code units go through `utf8EncodeTE`, all others
 * through `utf8EncodeJs`.
 *
 * @param str - The string to encode.
 * @param output - Destination buffer that receives the encoded bytes.
 * @param outputOffset - Index in `output` at which the first byte is written.
 */
export function utf8Encode(str: string, output: Uint8Array, outputOffset: number): void {
  const encode = str.length > TEXT_ENCODER_THRESHOLD ? utf8EncodeTE : utf8EncodeJs;
  encode(str, output, outputOffset);
}
// Number of buffered UTF-16 code units (4096) at which utf8DecodeJs converts
// its buffer to a string, keeping the argument list passed to
// String.fromCharCode bounded.
const CHUNK_SIZE = 0x1_000;

/**
 * Decodes `byteLength` bytes of `bytes`, starting at `inputOffset`, as UTF-8
 * using a pure-JavaScript loop.
 *
 * Input is not validated. A byte that is not a recognised lead byte is emitted
 * as a code unit with that byte's value, and continuation bytes are read with
 * their top two bits masked off without checking that they are continuation
 * bytes or that they lie inside the requested range.
 *
 * @param bytes - Buffer containing the encoded text.
 * @param inputOffset - Index of the first byte to decode.
 * @param byteLength - Number of bytes that make up the encoded text.
 * @returns The decoded string.
 */
export function utf8DecodeJs(bytes: Uint8Array, inputOffset: number, byteLength: number): string {
  const limit = inputOffset + byteLength;
  const codeUnits: Array<number> = [];
  let decoded = "";
  let cursor = inputOffset;

  // Moves everything buffered so far into the result string.
  const flush = (): void => {
    decoded += String.fromCharCode(...codeUnits);
    codeUnits.length = 0;
  };

  while (cursor < limit) {
    const lead = bytes[cursor++]!;

    if ((lead & 0x80) === 0) {
      // 0xxxxxxx: one byte
      codeUnits.push(lead);
    } else if ((lead & 0xe0) === 0xc0) {
      // 110xxxxx: two bytes
      const c2 = bytes[cursor++]! & 0x3f;
      codeUnits.push(((lead & 0x1f) << 6) | c2);
    } else if ((lead & 0xf0) === 0xe0) {
      // 1110xxxx: three bytes
      const c2 = bytes[cursor++]! & 0x3f;
      const c3 = bytes[cursor++]! & 0x3f;
      codeUnits.push(((lead & 0x1f) << 12) | (c2 << 6) | c3);
    } else if ((lead & 0xf8) === 0xf0) {
      // 11110xxx: four bytes
      const c2 = bytes[cursor++]! & 0x3f;
      const c3 = bytes[cursor++]! & 0x3f;
      const c4 = bytes[cursor++]! & 0x3f;
      const codePoint = ((lead & 0x07) << 18) | (c2 << 12) | (c3 << 6) | c4;
      if (codePoint > 0xffff) {
        // Outside the BMP: split into a high/low surrogate pair.
        const reduced = codePoint - 0x10000;
        codeUnits.push(((reduced >>> 10) & 0x3ff) | 0xd800, 0xdc00 | (reduced & 0x3ff));
      } else {
        codeUnits.push(codePoint);
      }
    } else {
      // Not a recognised lead byte: pass the raw value through.
      codeUnits.push(lead);
    }

    if (codeUnits.length >= CHUNK_SIZE) {
      flush();
    }
  }

  if (codeUnits.length > 0) {
    flush();
  }

  return decoded;
}
/** Single module-wide decoder instance, reused by every `utf8DecodeTD` call. */
const sharedTextDecoder = new TextDecoder();

// Inputs longer than this many bytes are handed to TextDecoder.
// The value should be determined by benchmarking, which might vary between
// engines and input data.
// Run `npx ts-node benchmark/decode-string.ts` for details.
const TEXT_DECODER_THRESHOLD = 200;

/**
 * Decodes `byteLength` bytes of `bytes`, starting at `inputOffset`, using the
 * shared `TextDecoder`.
 *
 * @param bytes - Buffer containing the encoded text.
 * @param inputOffset - Index of the first byte to decode.
 * @param byteLength - Number of bytes that make up the encoded text.
 * @returns The decoded string.
 */
export function utf8DecodeTD(bytes: Uint8Array, inputOffset: number, byteLength: number): string {
  // subarray creates a view over the same memory; nothing is copied here.
  return sharedTextDecoder.decode(bytes.subarray(inputOffset, inputOffset + byteLength));
}
/**
 * Decodes `byteLength` bytes of `bytes`, starting at `inputOffset`, as UTF-8,
 * choosing the implementation by input size: more than
 * `TEXT_DECODER_THRESHOLD` bytes go through `utf8DecodeTD`, anything shorter
 * through `utf8DecodeJs`.
 *
 * @param bytes - Buffer containing the encoded text.
 * @param inputOffset - Index of the first byte to decode.
 * @param byteLength - Number of bytes that make up the encoded text.
 * @returns The decoded string.
 */
export function utf8Decode(bytes: Uint8Array, inputOffset: number, byteLength: number): string {
  const decode = byteLength > TEXT_DECODER_THRESHOLD ? utf8DecodeTD : utf8DecodeJs;
  return decode(bytes, inputOffset, byteLength);
}
Version Info