Decode Tar archives with malformed UTF-8 data in headers (#19)

* Decode Tar archives with malformed UTF-8 data in headers

  Some Tar archives are poorly formed and leave malformed data in their headers, such as the [Arch Linux bootstrap image](http://mirror.rackspace.com/archlinux/iso/2022.02.01/archlinux-bootstrap-2022.02.01-x86_64.tar.gz). To make these decodable via tar, we must not throw when handling these entries.

* Only decode the value if necessary

  If we know that we're going to discard the key, we can safely skip decoding the value altogether.

* Still check that keys are valid

  Reject tar files with malformed keys.

Co-authored-by: Simon Binder <oss@simonbinder.eu>

commit: 5144ca2ee16e03d968ae7acf3ad69164bf63f3f0 [log] [tgz]
author: Ani Betts <anais@anaisbetts.org> Fri Feb 18 16:24:48 2022 -0500
committer: GitHub <noreply@github.com> Fri Feb 18 22:24:48 2022 +0100
tree: d4b8f9ea9da632806260ffd92135c8f314028fe2
parent: 012050e3fc92ef5068b7415f15751e0083263dee [diff]
diff --git a/lib/src/reader.dart b/lib/src/reader.dart
index 37d46c3..b9bc3d3 100644
--- a/lib/src/reader.dart
+++ b/lib/src/reader.dart

@@ -799,18 +799,20 @@
       // Skip over the equals sign
       offset = nextEquals + 1;
 
-      // Subtract one for trailing newline
+      // Subtract one for trailing newline for value
       final endOfValue = endOfEntry - 1;
-      final value = utf8.decoder.convert(data, offset, endOfValue);
 
-      if (!_isValidPaxRecord(key, value)) {
+      if (!_isValidPaxKey(key)) {
         error();
       }
 
       // If we're seeing weird PAX Version 0.0 sparse keys, expect alternating
       // GNU.sparse.offset and GNU.sparse.numbytes headers.
       if (key == paxGNUSparseNumBytes || key == paxGNUSparseOffset) {
-        if ((sparseMap.length.isEven && key != paxGNUSparseOffset) ||
+        final value = utf8.decoder.convert(data, offset, endOfValue);
+
+        if (!_isValidPaxRecord(key, value) ||
+            (sparseMap.length.isEven && key != paxGNUSparseOffset) ||
             (sparseMap.length.isOdd && key != paxGNUSparseNumBytes) ||
             value.contains(',')) {
           error();
@@ -820,6 +822,12 @@
       } else if (!ignoreUnknown || supportedPaxHeaders.contains(key)) {
         // Ignore unrecognized headers to avoid unbounded growth of the global
         // header map.
+        final value = unsafeUtf8Decoder.convert(data, offset, endOfValue);
+
+        if (!_isValidPaxRecord(key, value)) {
+          error();
+        }
+
         map[key] = value;
       }
 
@@ -844,16 +852,23 @@
     }
   }
 
+  // NB: Some Tar files have malformed UTF-8 data in the headers; we should
+  // decode them anyway, even if they're broken.
+  static const unsafeUtf8Decoder = Utf8Decoder(allowMalformed: true);
+
+  /// Whether [key] is non-empty and has no `=` or NUL, per the PAX standard.
+  static bool _isValidPaxKey(String key) {
+    return key.isNotEmpty && !key.contains('=') && !key.codeUnits.contains(0);
+  }
+
   /// Checks whether [key], [value] is a valid entry in a pax header.
   ///
   /// This is adopted from the Golang tar reader (`validPAXRecord`), which says
   /// that "Keys and values should be UTF-8, but the number of bad writers out
   /// there forces us to be a more liberal."
   static bool _isValidPaxRecord(String key, String value) {
-    // These limitations are documented in the PAX standard.
-    if (key.isEmpty || key.contains('=')) return false;
-
-    // These aren't, but Golangs's tar has them and got away with it.
+    // These aren't documented in any standard, but Golang's tar has them and
+    // got away with it.
     switch (key) {
       case paxPath:
       case paxLinkpath:
@@ -861,7 +876,7 @@
       case paxGname:
         return !value.codeUnits.contains(0);
       default:
-        return !key.codeUnits.contains(0);
+        return true;
     }
   }
 }
commit	5144ca2ee16e03d968ae7acf3ad69164bf63f3f0	[log] [tgz]
author	Ani Betts <anais@anaisbetts.org>	Fri Feb 18 16:24:48 2022 -0500
committer	GitHub <noreply@github.com>	Fri Feb 18 22:24:48 2022 +0100
tree	d4b8f9ea9da632806260ffd92135c8f314028fe2
parent	012050e3fc92ef5068b7415f15751e0083263dee [diff]