sdk/lib/convert/utf.dart - sdk.git - Git at Google

 // Copyright (c) 2013, the Dart project authors.  Please see the AUTHORS file
 // for details. All rights reserved. Use of this source code is governed by a
 // BSD-style license that can be found in the LICENSE file.

 part of dart.convert;

 /**
  * A [Utf8Encoder] converts strings to their UTF-8 code units (a list of
  * unsigned 8-bit integers).
  */
 class Utf8Encoder extends Converter<String, List<int>> {
   /**
    * Converts [string] to its UTF-8 code units (a list of
    * unsigned 8-bit integers).
    */
   List<int> convert(String string) => OLD_UTF_LIB.encodeUtf8(string);
 }

 /**
  * A [Utf8Decoder] converts UTF-8 code units (lists of unsigned 8-bit integers)
  * to a string.
  */
 class Utf8Decoder extends Converter<List<int>, String> {
   final bool _allowMalformed;

   /**
    * Instantiates a new [Utf8Decoder].
    *
    * The optional [allowMalformed] argument defines how [convert] deals
    * with invalid or unterminated character sequences.
    *
    * If it is `true` [convert] replaces invalid (or unterminated) character
    * sequences with the Unicode Replacement character `U+FFFD` (�). Otherwise
    * it throws a [FormatException].
    */
   Utf8Decoder({ bool allowMalformed: false })
       : this._allowMalformed = allowMalformed;

   /**
    * Converts the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the
    * corresponding string.
    */
   String convert(List<int> codeUnits) {
     StringBuffer buffer = new StringBuffer();
     _Utf8Decoder decoder = new _Utf8Decoder(_allowMalformed);
     decoder.convert(codeUnits, 0, codeUnits.length, buffer);
     decoder.close(buffer);
     return buffer.toString();
   }
 }

 // UTF-8 constants.
 const int _ONE_BYTE_LIMIT = 0x7f;   // 7 bytes
 const int _TWO_BYTE_LIMIT = 0x7ff;  // 11 bytes
 const int _THREE_BYTE_LIMIT = 0xffff;  // 16 bytes
 const int _FOUR_BYTE_LIMIT = 0x10ffff;  // 21 bytes, truncated to Unicode max.

 // UTF-16 constants.
 const int _SURROGATE_MASK = 0xF800;
 const int _SURROGATE_TAG_MASK = 0xFC00;
 const int _SURROGATE_VALUE_MASK = 0x3FF;
 const int _LEAD_SURROGATE_MIN = 0xD800;
 const int _TAIL_SURROGATE_MIN = 0xDC00;

 const int _REPLACEMENT_CHARACTER = 0xFFFD;
 const int _BOM_CHARACTER = 0xFEFF;

 bool _isSurrogate(int codeUnit) =>
     (codeUnit & _SURROGATE_MASK) == _LEAD_SURROGATE_MIN;
 bool _isLeadSurrogate(int codeUnit) =>
     (codeUnit & _SURROGATE_TAG_MASK) == _LEAD_SURROGATE_MIN;
 bool _isTailSurrogate(int codeUnit) =>
     (codeUnit & _SURROGATE_TAG_MASK) == _TAIL_SURROGATE_MIN;
 int _combineSurrogatePair(int lead, int tail) =>
     0x10000 | ((lead & _SURROGATE_VALUE_MASK) << 10)
             | (tail & _SURROGATE_VALUE_MASK);


 /**
  * Decodes UTF-8.
  *
  * The decoder handles chunked input.
  */
 // TODO(floitsch): make this class public.
 class _Utf8Decoder {
   final bool _allowMalformed;
   bool _isFirstCharacter = true;
   int _value = 0;
   int _expectedUnits = 0;
   int _extraUnits = 0;

   _Utf8Decoder(this._allowMalformed);

   bool get hasPartialInput => _expectedUnits > 0;

   // Limits of one through four byte encodings.
   static const List<int> _LIMITS = const <int>[
       _ONE_BYTE_LIMIT,
       _TWO_BYTE_LIMIT,
       _THREE_BYTE_LIMIT,
       _FOUR_BYTE_LIMIT ];

   void close(StringSink sink) {
     if (hasPartialInput) {
       if (!_allowMalformed) {
         throw new FormatException("Unfinished UTF-8 octet sequence");
       }
       sink.writeCharCode(_REPLACEMENT_CHARACTER);
     }
   }

   void convert(List<int> codeUnits, int startIndex, int endIndex,
                StringSink sink) {
     int value = _value;
     int expectedUnits = _expectedUnits;
     int extraUnits = _extraUnits;
     _value = 0;
     _expectedUnits = 0;
     _extraUnits = 0;

     int i = startIndex;
     loop: while (true) {
       multibyte: if (expectedUnits > 0) {
         do {
           if (i == endIndex) {
             break loop;
           }
           int unit = codeUnits[i];
           if ((unit & 0xC0) != 0x80) {
             expectedUnits = 0;
             if (!_allowMalformed) {
               throw new FormatException(
                   "Bad UTF-8 encoding 0x${unit.toRadixString(16)}");
             }
             _isFirstCharacter = false;
             sink.writeCharCode(_REPLACEMENT_CHARACTER);
             break multibyte;
           } else {
             value = (value << 6) | (unit & 0x3f);
             expectedUnits--;
             i++;
           }
         } while (expectedUnits > 0);
         if (value <= _LIMITS[extraUnits - 1]) {
           // Overly long encoding. The value could be encoded with a shorter
           // encoding.
           if (!_allowMalformed) {
             throw new FormatException(
                 "Overlong encoding of 0x${value.toRadixString(16)}");
           }
           expectedUnits = extraUnits = 0;
           value = _REPLACEMENT_CHARACTER;
         }
         if (value > _FOUR_BYTE_LIMIT) {
           if (!_allowMalformed) {
             throw new FormatException("Character outside valid Unicode range: "
                                       "0x${value.toRadixString(16)}");
           }
           value = _REPLACEMENT_CHARACTER;
         }
         if (!_isFirstCharacter || value != _BOM_CHARACTER) {
           sink.writeCharCode(value);
         }
         _isFirstCharacter = false;
       }

       while (i < endIndex) {
         int unit = codeUnits[i++];
         if (unit <= _ONE_BYTE_LIMIT) {
           _isFirstCharacter = false;
           sink.writeCharCode(unit);
         } else {
           if ((unit & 0xE0) == 0xC0) {
             value = unit & 0x1F;
             expectedUnits = extraUnits = 1;
             continue loop;
           }
           if ((unit & 0xF0) == 0xE0) {
             value = unit & 0x0F;
             expectedUnits = extraUnits = 2;
             continue loop;
           }
           // 0xF5, 0xF6 ... 0xFF never appear in valid UTF-8 sequences.
           if ((unit & 0xF8) == 0xF0 && unit < 0xF5) {
             value = unit & 0x07;
             expectedUnits = extraUnits = 3;
             continue loop;
           }
           if (!_allowMalformed) {
             throw new FormatException(
                 "Bad UTF-8 encoding 0x${unit.toRadixString(16)}");
           }
           value = _REPLACEMENT_CHARACTER;
           expectedUnits = extraUnits = 0;
           _isFirstCharacter = false;
           sink.writeCharCode(value);
         }
       }
       break loop;
     }
     if (expectedUnits > 0) {
       _value = value;
       _expectedUnits = expectedUnits;
       _extraUnits = extraUnits;
     }
   }
 }
	// Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file
	// for details. All rights reserved. Use of this source code is governed by a
	// BSD-style license that can be found in the LICENSE file.

	part of dart.convert;

	/**
	* A [Utf8Encoder] converts strings to their UTF-8 code units (a list of
	* unsigned 8-bit integers).
	*/
	class Utf8Encoder extends Converter<String, List<int>> {
	/**
	* Converts [string] to its UTF-8 code units (a list of
	* unsigned 8-bit integers).
	*/
	List<int> convert(String string) => OLD_UTF_LIB.encodeUtf8(string);
	}

	/**
	* A [Utf8Decoder] converts UTF-8 code units (lists of unsigned 8-bit integers)
	* to a string.
	*/
	class Utf8Decoder extends Converter<List<int>, String> {
	final bool _allowMalformed;

	/**
	* Instantiates a new [Utf8Decoder].
	*
	* The optional [allowMalformed] argument defines how [convert] deals
	* with invalid or unterminated character sequences.
	*
	* If it is `true` [convert] replaces invalid (or unterminated) character
	* sequences with the Unicode Replacement character `U+FFFD` (�). Otherwise
	* it throws a [FormatException].
	*/
	Utf8Decoder({ bool allowMalformed: false })
	: this._allowMalformed = allowMalformed;

	/**
	* Converts the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the
	* corresponding string.
	*/
	String convert(List<int> codeUnits) {
	StringBuffer buffer = new StringBuffer();
	_Utf8Decoder decoder = new _Utf8Decoder(_allowMalformed);
	decoder.convert(codeUnits, 0, codeUnits.length, buffer);
	decoder.close(buffer);
	return buffer.toString();
	}
	}

	// UTF-8 constants.
	const int _ONE_BYTE_LIMIT = 0x7f; // 7 bytes
	const int _TWO_BYTE_LIMIT = 0x7ff; // 11 bytes
	const int _THREE_BYTE_LIMIT = 0xffff; // 16 bytes
	const int _FOUR_BYTE_LIMIT = 0x10ffff; // 21 bytes, truncated to Unicode max.

	// UTF-16 constants.
	const int _SURROGATE_MASK = 0xF800;
	const int _SURROGATE_TAG_MASK = 0xFC00;
	const int _SURROGATE_VALUE_MASK = 0x3FF;
	const int _LEAD_SURROGATE_MIN = 0xD800;
	const int _TAIL_SURROGATE_MIN = 0xDC00;

	const int _REPLACEMENT_CHARACTER = 0xFFFD;
	const int _BOM_CHARACTER = 0xFEFF;

	bool _isSurrogate(int codeUnit) =>
	(codeUnit & _SURROGATE_MASK) == _LEAD_SURROGATE_MIN;
	bool _isLeadSurrogate(int codeUnit) =>
	(codeUnit & _SURROGATE_TAG_MASK) == _LEAD_SURROGATE_MIN;
	bool _isTailSurrogate(int codeUnit) =>
	(codeUnit & _SURROGATE_TAG_MASK) == _TAIL_SURROGATE_MIN;
	int _combineSurrogatePair(int lead, int tail) =>
	0x10000 \| ((lead & _SURROGATE_VALUE_MASK) << 10)
	\| (tail & _SURROGATE_VALUE_MASK);


	/**
	* Decodes UTF-8.
	*
	* The decoder handles chunked input.
	*/
	// TODO(floitsch): make this class public.
	class _Utf8Decoder {
	final bool _allowMalformed;
	bool _isFirstCharacter = true;
	int _value = 0;
	int _expectedUnits = 0;
	int _extraUnits = 0;

	_Utf8Decoder(this._allowMalformed);

	bool get hasPartialInput => _expectedUnits > 0;

	// Limits of one through four byte encodings.
	static const List<int> _LIMITS = const <int>[
	_ONE_BYTE_LIMIT,
	_TWO_BYTE_LIMIT,
	_THREE_BYTE_LIMIT,
	_FOUR_BYTE_LIMIT ];

	void close(StringSink sink) {
	if (hasPartialInput) {
	if (!_allowMalformed) {
	throw new FormatException("Unfinished UTF-8 octet sequence");
	}
	sink.writeCharCode(_REPLACEMENT_CHARACTER);
	}
	}

	void convert(List<int> codeUnits, int startIndex, int endIndex,
	StringSink sink) {
	int value = _value;
	int expectedUnits = _expectedUnits;
	int extraUnits = _extraUnits;
	_value = 0;
	_expectedUnits = 0;
	_extraUnits = 0;

	int i = startIndex;
	loop: while (true) {
	multibyte: if (expectedUnits > 0) {
	do {
	if (i == endIndex) {
	break loop;
	}
	int unit = codeUnits[i];
	if ((unit & 0xC0) != 0x80) {
	expectedUnits = 0;
	if (!_allowMalformed) {
	throw new FormatException(
	"Bad UTF-8 encoding 0x${unit.toRadixString(16)}");
	}
	_isFirstCharacter = false;
	sink.writeCharCode(_REPLACEMENT_CHARACTER);
	break multibyte;
	} else {
	value = (value << 6) \| (unit & 0x3f);
	expectedUnits--;
	i++;
	}
	} while (expectedUnits > 0);
	if (value <= _LIMITS[extraUnits - 1]) {
	// Overly long encoding. The value could be encoded with a shorter
	// encoding.
	if (!_allowMalformed) {
	throw new FormatException(
	"Overlong encoding of 0x${value.toRadixString(16)}");
	}
	expectedUnits = extraUnits = 0;
	value = _REPLACEMENT_CHARACTER;
	}
	if (value > _FOUR_BYTE_LIMIT) {
	if (!_allowMalformed) {
	throw new FormatException("Character outside valid Unicode range: "
	"0x${value.toRadixString(16)}");
	}
	value = _REPLACEMENT_CHARACTER;
	}
	if (!_isFirstCharacter \|\| value != _BOM_CHARACTER) {
	sink.writeCharCode(value);
	}
	_isFirstCharacter = false;
	}

	while (i < endIndex) {
	int unit = codeUnits[i++];
	if (unit <= _ONE_BYTE_LIMIT) {
	_isFirstCharacter = false;
	sink.writeCharCode(unit);
	} else {
	if ((unit & 0xE0) == 0xC0) {
	value = unit & 0x1F;
	expectedUnits = extraUnits = 1;
	continue loop;
	}
	if ((unit & 0xF0) == 0xE0) {
	value = unit & 0x0F;
	expectedUnits = extraUnits = 2;
	continue loop;
	}
	// 0xF5, 0xF6 ... 0xFF never appear in valid UTF-8 sequences.
	if ((unit & 0xF8) == 0xF0 && unit < 0xF5) {
	value = unit & 0x07;
	expectedUnits = extraUnits = 3;
	continue loop;
	}
	if (!_allowMalformed) {
	throw new FormatException(
	"Bad UTF-8 encoding 0x${unit.toRadixString(16)}");
	}
	value = _REPLACEMENT_CHARACTER;
	expectedUnits = extraUnits = 0;
	_isFirstCharacter = false;
	sink.writeCharCode(value);
	}
	}
	break loop;
	}
	if (expectedUnits > 0) {
	_value = value;
	_expectedUnits = expectedUnits;
	_extraUnits = extraUnits;
	}
	}
	}