eazip/read/
mod.rs

1//! Utilities to read an archive.
2
3use std::{
4    borrow::Cow,
5    collections::HashMap,
6    io::{self, BufRead, Read, Seek},
7};
8
9use crate::{
10    CompressionMethod, Decompressor, FileType, types,
11    utils::{Crc32Checker, LengthChecker, Timestamp, cp437},
12};
13
14mod extra_field;
15mod raw;
16
17use extra_field::{ExtraField, ExtraFields};
18
/// Builds an [`io::Error`] of kind `InvalidData` carrying `msg`.
#[cold]
fn invalid(msg: &str) -> io::Error {
    let kind = io::ErrorKind::InvalidData;
    io::Error::new(kind, msg)
}
23
/// Builds the `Unsupported` error returned when an encrypted file is read.
#[cold]
fn encrypted_file() -> io::Error {
    let kind = io::ErrorKind::Unsupported;
    io::Error::new(kind, "encrypted file")
}
28
/// Builds the `Unsupported` error returned when a stored file was expected
/// but the entry is compressed.
#[cold]
fn compressed() -> io::Error {
    let kind = io::ErrorKind::Unsupported;
    io::Error::new(kind, "compressed file")
}
33
/// Checks that a symlink stored at `name` and pointing to `target` cannot
/// resolve to a location above the root of the archive.
///
/// Absolute targets, targets containing a backslash and (on Windows) targets
/// containing `:` are rejected outright. Otherwise the target is walked
/// component by component, starting from the directory that contains the
/// link, and rejected as soon as a `..` would climb above the root.
///
/// Returns `true` if the target is acceptable.
fn validate_symlink(name: &str, target: &str) -> bool {
    if target.starts_with('/') || target.contains('\\') || (cfg!(windows) && target.contains(':')) {
        return false;
    }

    // Number of directories between the root and the link itself.
    let mut depth = name.split('/').count() - 1;
    for part in target.split('/') {
        match part {
            // Empty components (`a//b`, trailing `/`) are skipped by path
            // resolution, so they must not count as a directory level:
            // counting them would let `x//../../..` climb one level too far.
            "" | "." => (),
            ".." => match depth.checked_sub(1) {
                Some(d) => depth = d,
                None => return false,
            },
            _ => depth += 1,
        }
    }

    true
}
53
/// Combination of [`Read`] and [`Seek`] under a single trait name,
/// automatically implemented for every type that implements both.
trait ReadSeek: Read + Seek {}
impl<R: Read + Seek> ReadSeek for R {}
56
/// Combination of [`BufRead`] and [`Seek`] under a single trait name,
/// automatically implemented for every type that implements both.
///
/// It allows `Metadata::_extract` to take its reader as a
/// `&mut dyn BufReadSeek` trait object.
trait BufReadSeek: BufRead + Seek {}
impl<R: BufRead + Seek> BufReadSeek for R {}
59
/// The method used to encrypt a file.
///
/// `eazip` does not provide the tools to decrypt these files, but provides the
/// required metadata if you really need to.
///
/// This is only provided for completeness; please don't use this in scenarios
/// where security actually matters, and use proper tools instead (e.g. `age`).
#[derive(Debug, Clone, Copy)]
#[non_exhaustive]
pub enum EncryptionMethod {
    /// Legacy ZipCrypto encryption.
    ZipCrypto,
    /// The file is encrypted using AES in CTR mode.
    ///
    /// See [the specification](https://www.winzip.com/en/support/aes-encryption/#file-format1)
    /// for the format of the encrypted files.
    Aes {
        /// The size of the AES key. This may be 128, 192 or 256 bits.
        key_size: u16,
        /// Whether to check the CRC32 of the decrypted content.
        ///
        /// If `true`, this will lead to a data leak: the CRC32 of the
        /// decrypted content is then stored unencrypted in the archive.
        check_crc32: bool,
    },
}
85
/// An open ZIP archive without a reader.
///
/// Only the parsed structure of the archive is stored; the reader must be
/// passed again to the methods that need to access file contents.
pub struct RawArchive {
    // Metadata of each entry, exposed through `entries()`.
    entries: Vec<Metadata>,
    // Raw bytes of the archive comment, exposed through `comment()`.
    comment: Box<[u8]>,
}
91
92impl RawArchive {
93    /// Creates a `RawArchive` from a reader.
94    ///
95    /// The same reader should be used for other methods.
96    #[inline]
97    pub fn new<R: Read + Seek>(reader: &mut R) -> io::Result<Self> {
98        let (entries, comment) = raw::read_archive(reader)?;
99        Ok(Self { entries, comment })
100    }
101
102    /// Gets the list of entries in this archive.
103    #[inline]
104    pub fn entries(&self) -> &[Metadata] {
105        &self.entries
106    }
107
108    /// Gets the comment of the archive.
109    #[inline]
110    pub fn comment(&self) -> &[u8] {
111        &self.comment
112    }
113
114    /// Extracts the archive to the given directory.
115    ///
116    /// The directory will be created if needed, but *not* its parent.
117    pub fn extract<R: BufRead + Seek>(
118        &self,
119        reader: &mut R,
120        at: &std::path::Path,
121    ) -> io::Result<()> {
122        match std::fs::create_dir(at) {
123            Ok(()) => (),
124            Err(err) if err.kind() == io::ErrorKind::AlreadyExists => (),
125            Err(err) => return Err(err),
126        };
127
128        for entry in &self.entries {
129            entry.extract(reader, at)?;
130        }
131
132        Ok(())
133    }
134
135    /// Extracts the archive to the given directory in parallel.
136    ///
137    /// The directory will be created if needed, but *not* its parent.
138    ///
139    /// The reader should implement [`sync_file::ReadAt`], like [`io::Cursor`]
140    /// or [`sync_file::RandomAccessFile`].
141    #[cfg(feature = "parallel")]
142    pub fn parallel_extract<R: sync_file::ReadAt + sync_file::Size + Sync>(
143        &self,
144        reader: &R,
145        at: &std::path::Path,
146    ) -> io::Result<()> {
147        use rayon::prelude::*;
148
149        match std::fs::create_dir(at) {
150            Ok(()) => (),
151            Err(err) if err.kind() == io::ErrorKind::AlreadyExists => (),
152            Err(err) => return Err(err),
153        };
154
155        self.entries.par_iter().try_for_each_init(
156            || io::BufReader::new(sync_file::Adapter::new(reader)),
157            |reader, entry| entry.extract(reader, at),
158        )?;
159
160        Ok(())
161    }
162}
163
164impl FileType {
165    fn test(attr: u32, name: &str) -> Option<Self> {
166        let dos_attr = attr as u16;
167        let unix_mode = (attr >> 16) as u16;
168        let unix_kind = unix_mode >> 12;
169
170        let is_file = (dos_attr & (1 << 5)) != 0 || unix_kind == 8;
171        let is_dir = (dos_attr & (1 << 4)) != 0 || unix_kind == 4;
172        let is_symlink = unix_kind == 10;
173        let trailing_slash = name.ends_with('/');
174
175        match (is_file, is_dir, trailing_slash, is_symlink) {
176            (_, false, false, false) => Some(FileType::File),
177            (false, _, true, false) => Some(FileType::Directory),
178            (false, false, false, true) => Some(FileType::Symlink),
179            _ => None,
180        }
181    }
182}
183
184fn convert_string(raw: &[u8], force_unicode: bool) -> Option<(Cow<'_, str>, Option<u32>)> {
185    // MacOS stores the file name as UTF8, but does not use the unicode flag,
186    // and everyone seems fine with that, so if we meet UTF8 we'll just pretend
187    // that everything is fine.
188    if let Ok(name) = str::from_utf8(raw) {
189        return Some((Cow::Borrowed(name), None));
190    }
191
192    // If we didn't find UTF8 and it wasn't expected, handle it as CP437.
193    if force_unicode {
194        None
195    } else {
196        let name = cp437::convert(raw);
197        Some((Cow::Owned(name), Some(crc32fast::hash(raw))))
198    }
199}
200
/// Validates and normalizes the name of an entry.
///
/// Names that are absolute, contain a backslash or a NUL character (or `:`
/// on Windows), contain a `..` component or end with a `.` component are
/// rejected. Empty and `./` components are removed. Returns `None` for
/// rejected names and for names that are empty once normalized.
fn check_name(name: &str) -> Option<Box<str>> {
    let forbidden = name.starts_with('/')
        || name.contains('\\')
        || name.contains('\0')
        || (cfg!(windows) && name.contains(':'));
    if forbidden {
        return None;
    }

    let mut cleaned = String::with_capacity(name.len());
    for segment in name.split_inclusive('/') {
        // Forbid parent parts as they have weird interactions with symlinks
        if matches!(segment, "." | ".." | "../") {
            return None;
        }
        // Components that do not change the path are simply dropped.
        if !matches!(segment, "/" | "./") {
            cleaned.push_str(segment);
        }
    }

    (!cleaned.is_empty()).then(|| cleaned.into_boxed_str())
}
226
/// The metadata of a ZIP entry.
#[derive(Debug)]
pub struct Metadata {
    // Offset of the local file header, as given by the central header.
    header_offset: u64,
    /// The offset in the reader at which the raw content of the entry starts.
    pub data_offset: u64,

    /// The size of the raw (compressed) content of the entry.
    pub compressed_size: u64,
    /// The expected size of the content once decompressed.
    pub uncompressed_size: u64,
    /// The method used to compress the content.
    pub compression_method: CompressionMethod,
    /// The expected CRC32 of the decompressed content.
    pub crc32: u32,
    /// Whether the entry is a file, a directory or a symlink.
    pub file_type: FileType,

    /// The modification time, if an extra field provided it.
    pub modification_time: Option<Timestamp>,
    /// The access time, if an extra field provided it.
    pub access_time: Option<Timestamp>,
    /// The creation time, if an extra field provided it.
    pub creation_time: Option<Timestamp>,

    /// The encryption method, or `None` if the entry is not encrypted.
    pub encryption: Option<EncryptionMethod>,

    // Validated and normalized name, see `check_name`.
    name: Box<str>,
    comment: Box<str>,

    // Bit 3 of the general purpose flags.
    is_streaming: bool,
    // Set once a Zip64 extended information extra field has been parsed.
    is_zip64: bool,
    // Raw general purpose flags of the header.
    flags: u16,
}
252
253impl Metadata {
254    fn from_local_header(
255        header: types::LocalFileHeader,
256        file_name: &[u8],
257        extra_fields: &[u8],
258    ) -> Option<Self> {
259        let flags = header.flags.get();
260        let is_encrypted = flags & (1 << 0) != 0;
261        let is_streaming = flags & (1 << 3) != 0;
262        let is_unicode = flags & (1 << 11) != 0;
263
264        if { header.signature } != types::LocalFileHeader::SIGNATURE {
265            return None;
266        }
267
268        let (name, name_crc) = convert_string(file_name, is_unicode)?;
269        let name = check_name(&name)?;
270
271        let mut meta = Self {
272            crc32: header.crc32.get(),
273            encryption: is_encrypted.then_some(EncryptionMethod::ZipCrypto),
274            header_offset: 0,
275            data_offset: 0,
276
277            compressed_size: header.compressed_size.get() as u64,
278            uncompressed_size: header.uncompressed_size.get() as u64,
279            compression_method: CompressionMethod(header.compression_method.get()),
280            file_type: FileType::File,
281
282            modification_time: None,
283            access_time: None,
284            creation_time: None,
285
286            name,
287            comment: Box::default(),
288
289            is_streaming,
290            is_zip64: false,
291            flags,
292        };
293
294        meta.parse_extra_fields(ExtraFields(extra_fields), name_crc, None)?;
295
296        Some(meta)
297    }
298
299    fn from_central_header(
300        header: types::CentralFileHeader,
301        file_name: &[u8],
302        extra_fields: &[u8],
303        comment: &[u8],
304    ) -> Option<Self> {
305        let flags = header.flags.get();
306        let is_encrypted = flags & (1 << 0) != 0;
307        let is_streaming = flags & (1 << 3) != 0;
308        let is_unicode = flags & (1 << 11) != 0;
309
310        if { header.signature } != types::CentralFileHeader::SIGNATURE
311            || header.disk_number.get() != 0
312        {
313            return None;
314        }
315
316        let (comment, comment_crc) = convert_string(comment, is_unicode)?;
317        let comment = comment.into_owned().into_boxed_str();
318        let (name, name_crc) = convert_string(file_name, is_unicode)?;
319        let name = check_name(&name)?;
320        let file_type = FileType::test(header.external_attributes.get(), &name)?;
321
322        let mut meta = Self {
323            crc32: header.crc32.get(),
324            encryption: is_encrypted.then_some(EncryptionMethod::ZipCrypto),
325            header_offset: header.local_header_offset.get() as u64,
326            data_offset: 0,
327
328            compressed_size: header.compressed_size.get() as u64,
329            uncompressed_size: header.uncompressed_size.get() as u64,
330            compression_method: CompressionMethod(header.compression_method.get()),
331            file_type,
332
333            modification_time: None,
334            access_time: None,
335            creation_time: None,
336
337            name,
338            comment,
339
340            is_streaming,
341            is_zip64: false,
342            flags,
343        };
344
345        meta.parse_extra_fields(ExtraFields(extra_fields), name_crc, comment_crc)?;
346
347        Some(meta)
348    }
349
    /// Applies the extra fields of a header to this metadata.
    ///
    /// `name_crc` and `comment_crc` are the CRC32 of the raw header name and
    /// comment, as returned by `convert_string` (so they are `None` when the
    /// raw bytes were valid UTF-8). A Unicode name or comment field is only
    /// accepted when the CRC32 it stores matches them.
    ///
    /// Returns `None` as soon as a field is invalid or inconsistent with the
    /// header, or if the compression method is still the AES marker once all
    /// fields have been processed.
    fn parse_extra_fields(
        &mut self,
        extra_fields: ExtraFields,
        name_crc: Option<u32>,
        comment_crc: Option<u32>,
    ) -> Option<()> {
        for field in extra_fields.iter() {
            match field {
                ExtraField::Zip64ExtendedInformation(mut info) => {
                    // Only the values that are saturated in the header are
                    // read from the Zip64 field, in this fixed order.
                    if self.uncompressed_size == 0xffff_ffff {
                        self.uncompressed_size = info.next()?;
                    }
                    if self.compressed_size == 0xffff_ffff {
                        self.compressed_size = info.next()?;
                    }
                    if self.header_offset == 0xffff_ffff {
                        self.header_offset = info.next()?;
                    }
                    // Disk number must be 0
                    info.end()?;
                    self.is_zip64 = true;
                }
                ExtraField::UnicodeComment(unicode) => {
                    // The field must refer to the comment found in the header.
                    if Some(unicode.header_comment_crc32) != comment_crc {
                        return None;
                    }
                    self.comment = unicode.comment.into();
                }

                ExtraField::UnicodeName(unicode) => {
                    // The field must refer to the name found in the header.
                    if Some(unicode.header_name_crc32) != name_crc {
                        return None;
                    }
                    // The replacement name goes through the same validation.
                    self.name = check_name(unicode.name)?;
                }

                ExtraField::Ntfs(ntfs) => {
                    self.modification_time = ntfs.times.mtime;
                    self.access_time = ntfs.times.atime;
                    self.creation_time = ntfs.times.ctime;
                }

                ExtraField::ExtendedTimestamp(ts) => {
                    self.modification_time = ts.modification_time;
                    self.access_time = ts.access_time;
                    self.creation_time = ts.creation_time;
                }

                ExtraField::Aes(aes) => {
                    // The header must announce the AES marker as compression
                    // method, and its CRC32 must be 0 when it is not checked.
                    if self.compression_method != CompressionMethod::AES
                        || (!aes.check_crc32 && self.crc32 != 0)
                    {
                        return None;
                    }
                    // The header must also be flagged as encrypted.
                    let Some(enc @ EncryptionMethod::ZipCrypto) = &mut self.encryption else {
                        return None;
                    };

                    *enc = EncryptionMethod::Aes {
                        key_size: aes.key_size,
                        check_crc32: aes.check_crc32,
                    };
                    // The actual compression method is stored in the field.
                    self.compression_method = aes.compression;
                }

                ExtraField::Invalid(_, _) => return None,

                _ => (),
            }
        }

        // The AES marker must have been replaced by an AES extra field.
        if self.compression_method == CompressionMethod::AES {
            return None;
        }

        Some(())
    }
427
    /// Returns `true` if this file is encrypted, that is if
    /// [`Self::encryption`] is set.
    #[inline]
    pub fn is_encrypted(&self) -> bool {
        self.encryption.is_some()
    }
433
434    /// Gets the name of this entry.
435    #[inline]
436    pub fn name(&self) -> &str {
437        &self.name
438    }
439
440    /// Gets the comment of this entry.
441    #[inline]
442    pub fn comment(&self) -> &str {
443        &self.comment
444    }
445
446    /// Returns a reader with the content of the file.
447    ///
448    /// Unsupported compression methods and encrypted files will return an error.
449    pub fn read<R: BufRead + Seek>(&self, reader: R) -> io::Result<impl Read + use<R>> {
450        if self.encryption.is_some() {
451            return Err(encrypted_file());
452        }
453
454        let reader = Decompressor::new(self.read_raw(reader)?, self.compression_method)?;
455        Ok(self.content_checker(reader))
456    }
457
458    /// Returns a reader with the content of the file.
459    ///
460    /// Errors if the file is compressed, encrypted or corrupted. Is is not
461    /// necessary to use `Metadata::content_checker` on the result.
462    ///
463    /// It is useful if you know that the file is stored as-is and you want to
464    /// take advantage of the `BufReader` or the `Seek` implementation.
465    pub fn read_stored<R: Read + Seek>(&self, mut reader: R) -> io::Result<io::Take<R>> {
466        if self.encryption.is_some() {
467            return Err(encrypted_file());
468        }
469        if self.compression_method != CompressionMethod::STORE {
470            return Err(compressed());
471        }
472
473        // Check CRC beforehand. Length has already been checked.
474        let mut checker = Crc32Checker::new(self.read_raw(&mut reader)?, self.crc32);
475        std::io::copy(&mut checker, &mut io::sink())?;
476
477        self.read_raw(reader)
478    }
479
480    /// Returns a reader with the raw, uncompressed, content of the file.
481    ///
482    /// The uncompressed content should be checked with `content_checker`.
483    pub fn read_raw<R: Read + Seek>(&self, mut reader: R) -> io::Result<io::Take<R>> {
484        reader.seek(io::SeekFrom::Start(self.data_offset))?;
485        Ok(reader.take(self.compressed_size))
486    }
487
488    /// Wraps a reader to check that its content matches this metadata.
489    ///
490    /// It is particularly  useful in combinaison of `read_raw`.
491    #[inline]
492    pub fn content_checker<R: Read>(&self, reader: R) -> impl Read + use<R> {
493        Crc32Checker::new(
494            LengthChecker::new(reader, self.uncompressed_size),
495            self.crc32,
496        )
497    }
498
499    /// Extracts this entry as if the root of the archive was at `root`.
500    #[inline]
501    pub fn extract<R: BufRead + Seek>(
502        &self,
503        reader: &mut R,
504        root: impl AsRef<std::path::Path>,
505    ) -> io::Result<()> {
506        self._extract(reader, root.as_ref())
507    }
508
509    fn _extract(&self, reader: &mut dyn BufReadSeek, at: &std::path::Path) -> io::Result<()> {
510        if !std::fs::metadata(at)?.is_dir() {
511            return Err(io::Error::from(io::ErrorKind::NotFound));
512        }
513
514        let path = at.join(&*self.name);
515        std::fs::create_dir_all(path.parent().unwrap())?;
516
517        match self.file_type {
518            FileType::File => {
519                let mut f = std::fs::File::create_new(&path)?;
520                io::copy(&mut self.read(reader)?, &mut f)?;
521
522                if let Some(mod_time) = self.modification_time {
523                    f.set_times(std::fs::FileTimes::new().set_modified(mod_time.to_std()))?;
524                }
525            }
526            FileType::Directory => {
527                std::fs::create_dir(path)?;
528            }
529            FileType::Symlink => {
530                let target = io::read_to_string(self.read(reader)?)?;
531                if !validate_symlink(&self.name, &target) {
532                    return Err(invalid("invalid symlink target"));
533                }
534
535                #[cfg(unix)]
536                std::os::unix::fs::symlink(target, path)?;
537
538                #[cfg(windows)]
539                if target.ends_with('/') {
540                    std::os::windows::fs::symlink_dir(target, path)?;
541                } else {
542                    std::os::windows::fs::symlink_file(target, path)?;
543                }
544
545                #[cfg(not(any(unix, windows)))]
546                std::fs::write(path, target.as_bytes())?;
547            }
548        }
549
550        Ok(())
551    }
552}
553
/// An open ZIP archive.
///
/// This type owns the reader. If you need something more flexible, use
/// [`RawArchive`] instead.
///
/// # Example
///
/// Print the name and content of each file in the archive:
///
/// ```no_run
/// let mut archive = eazip::Archive::open("example.zip")?;
///
/// for i in 0..archive.entries().len() {
///     let mut entry = archive.get_by_index(i).unwrap();
///     let name = entry.metadata().name();
///     let content = std::io::read_to_string(entry.read()?)?;
///
///     println!("{name}: {content}");
/// }
///
/// # Ok::<(), std::io::Error>(())
/// ```
pub struct Archive<R> {
    // Parsed structure of the archive.
    inner: RawArchive,
    // Maps the name of each entry to its index in `inner.entries()`.
    names: HashMap<Box<str>, usize>,
    // The reader the archive was parsed from, used to read file contents.
    reader: R,
}
581
582impl Archive<io::BufReader<std::fs::File>> {
583    /// Opens the given file as a ZIP archive.
584    #[inline]
585    pub fn open(path: impl AsRef<std::path::Path>) -> io::Result<Self> {
586        Self::_open(path.as_ref())
587    }
588
589    fn _open(path: &std::path::Path) -> io::Result<Self> {
590        Self::new(io::BufReader::new(std::fs::File::open(path)?))
591    }
592}
593
#[cfg(feature = "parallel")]
impl Archive<io::BufReader<sync_file::SyncFile>> {
    /// Opens the file at `path` as a ZIP archive ready for parallel
    /// extraction.
    #[inline]
    pub fn open_parallel(path: impl AsRef<std::path::Path>) -> io::Result<Self> {
        let path = path.as_ref();
        Self::_open(path)
    }

    /// Non-generic implementation of [`Self::open_parallel`].
    fn _open(path: &std::path::Path) -> io::Result<Self> {
        let file = sync_file::SyncFile::open(path)?;
        Self::new(io::BufReader::new(file))
    }
}
606
607impl<R: BufRead + Seek> Archive<R> {
608    /// Opens a ZIP archive from a reader.
609    ///
610    /// This also perform many validation checks on the archive to make sure
611    /// that is it well-formed and does not have dangerous or duplicated paths.
612    /// The validity of file contents is checked lazily when reading them.
613    ///
614    /// The exact rules around validation are not part of semver guaranties and
615    /// may change at every release.
616    ///
617    /// **The targets of symlinks are not checked yet here**, though they are
618    /// through `extract` and `extract_parallel`.
619    pub fn new(mut reader: R) -> io::Result<Self> {
620        let inner = RawArchive::new(&mut reader)?;
621
622        let names = inner
623            .entries()
624            .iter()
625            .enumerate()
626            .map(|(i, meta)| (meta.name().into(), i))
627            .collect();
628
629        Ok(Self {
630            inner,
631            names,
632            reader,
633        })
634    }
635
636    /// Gets the list of entries in the archive.
637    #[inline]
638    pub fn entries(&self) -> &[Metadata] {
639        &self.inner.entries
640    }
641
642    /// Gets a file by its index.
643    #[inline]
644    pub fn get_by_index(&mut self, index: usize) -> Option<File<'_, R>> {
645        let metadata = self.inner.entries().get(index)?;
646        Some(File {
647            metadata,
648            reader: &mut self.reader,
649        })
650    }
651
652    /// Gets a file by its name.
653    pub fn get_by_name(&mut self, name: &str) -> Option<File<'_, R>> {
654        let index = *self.names.get(name)?;
655        self.get_by_index(index)
656    }
657
658    /// Gets the index of a file in [`Self::entries`] by its name.
659    pub fn index_of(&self, name: &str) -> Option<usize> {
660        self.names.get(name).copied()
661    }
662
663    /// Gets the comment of the archive.
664    #[inline]
665    pub fn commment(&self) -> &[u8] {
666        &self.inner.comment
667    }
668
669    /// Extracts the archive to the given directory.
670    ///
671    /// The directory will be created if needed, but *not* its parent.
672    #[inline]
673    pub fn extract(&mut self, at: impl AsRef<std::path::Path>) -> io::Result<()> {
674        self.inner.extract(&mut self.reader, at.as_ref())
675    }
676
677    /// Extracts the archive to the given directory in parallel.
678    ///
679    /// The directory will be created if needed, but *not* its parent.
680    ///
681    /// The reader should implement [`sync_file::ReadAt`], like [`io::Cursor`]
682    /// or [`sync_file::SyncFile`].
683    #[cfg(feature = "parallel")]
684    #[inline]
685    pub fn parallel_extract(&self, at: impl AsRef<std::path::Path>) -> io::Result<()>
686    where
687        R: sync_file::ReadAt + sync_file::Size + Sync,
688    {
689        self.inner.parallel_extract(&self.reader, at.as_ref())
690    }
691
    /// Gets a shared reference to the underlying reader.
    ///
    /// This is the reader that was given to [`Self::new`].
    #[inline]
    pub fn get_ref(&self) -> &R {
        &self.reader
    }
697
    /// Gets a mutable reference to the underlying reader.
    ///
    /// This is the reader that was given to [`Self::new`].
    #[inline]
    pub fn get_mut(&mut self) -> &mut R {
        &mut self.reader
    }
703}
704
/// A file in a ZIP archive.
///
/// It pairs the metadata of an entry with a mutable borrow of the reader of
/// its [`Archive`].
pub struct File<'a, R> {
    // Metadata of the entry, borrowed from the archive.
    metadata: &'a Metadata,
    // Reader of the archive, used to access the content of the entry.
    reader: &'a mut R,
}
710
711impl<'a, R: BufRead + Seek> File<'a, R> {
    /// Gets the metadata of the file.
    ///
    /// The lifetime of the returned reference is bound to the `Archive`, so it
    /// can outlive `self`.
    #[inline]
    pub fn metadata(&self) -> &'a Metadata {
        // The stored reference is `Copy`: it is returned with its full
        // lifetime `'a` rather than reborrowed from `self`.
        self.metadata
    }
720
721    /// Returns a reader with the content of the file.
722    ///
723    /// Unsupported compression methods will return an error.
724    #[inline]
725    pub fn read(&mut self) -> io::Result<impl Read + '_> {
726        self.metadata.read(&mut *self.reader)
727    }
728
729    /// Returns a reader with the content of the file.
730    ///
731    /// Errors if the file is compressed, encrypted or corrupted. Is is not
732    /// necessary to use `Metadata::content_checker` on the result.
733    ///
734    /// It is useful if you know that the file is stored as-is and you want to
735    /// take advantage of the `BufReader` or the `Seek` implementation.
736    pub fn read_stored(self) -> io::Result<io::Take<&'a mut R>> {
737        self.metadata.read_stored(self.reader)
738    }
739
740    /// Returns a reader with the raw, compressed, content of the file.
741    ///
742    /// The uncompressed content should be checked with [`Metadata::content_checker`].
743    #[inline]
744    pub fn read_raw(&mut self) -> io::Result<io::Take<&mut R>> {
745        self.metadata.read_raw(self.reader)
746    }
747
    /// Consumes self, returning the underlying reader.
    ///
    /// This reader can be used with [`Metadata::read_raw`] to read the raw,
    /// compressed content of the file. The uncompressed content should then be
    /// checked with [`Metadata::content_checker`].
    ///
    /// The returned reference keeps the full lifetime `'a` of the borrow of
    /// the archive.
    pub fn into_reader(self) -> &'a mut R {
        self.reader
    }
756}
757
#[test]
fn symlink_validation() {
    // (entry name, link target, whether the link must be accepted)
    let cases = [
        ("a/b", "../c", true),
        ("a/b", "../../c", false),
        ("a/b", "/c", false),
    ];
    for (name, target, accepted) in cases {
        assert_eq!(validate_symlink(name, target), accepted);
    }

    // Drive-qualified targets are only rejected on Windows.
    #[cfg(windows)]
    assert!(!validate_symlink("a/b", "C:/e"));
}