eazip/read/
mod.rs

1//! Utilities to read an archive.
2
3use std::{
4    borrow::Cow,
5    collections::HashMap,
6    io::{self, BufRead, Read, Seek},
7};
8
9use crate::{CompressionMethod, Decompressor, FileType, Timestamp, types, utils};
10
11mod extra_field;
12mod raw;
13
14use extra_field::{ExtraField, ExtraFields};
15
/// Builds an [`io::Error`] of kind [`io::ErrorKind::InvalidData`] carrying `msg`.
///
/// Marked `#[cold]` because it is only called on error paths.
#[cold]
fn invalid(msg: &str) -> io::Error {
    let kind = io::ErrorKind::InvalidData;
    io::Error::new(kind, msg)
}
20
/// Builds the [`io::ErrorKind::Unsupported`] error returned when an encrypted
/// entry is read.
#[cold]
fn encrypted_file() -> io::Error {
    const MESSAGE: &str = "encrypted file";
    io::Error::new(io::ErrorKind::Unsupported, MESSAGE)
}
25
/// Builds the [`io::ErrorKind::Unsupported`] error returned when a compressed
/// entry is read through an API that only handles stored data.
#[cold]
fn compressed() -> io::Error {
    const MESSAGE: &str = "compressed file";
    io::Error::new(io::ErrorKind::Unsupported, MESSAGE)
}
30
/// Shorthand for `Read + Seek`, usable as a single trait object bound.
trait ReadSeek: Read + Seek {}

// Blanket implementation: anything that is `Read + Seek` is a `ReadSeek`.
impl<T> ReadSeek for T where T: Read + Seek {}
33
/// Shorthand for `BufRead + Seek`, usable as a single trait object bound.
trait BufReadSeek: BufRead + Seek {}

// Blanket implementation: anything that is `BufRead + Seek` is a `BufReadSeek`.
impl<T> BufReadSeek for T where T: BufRead + Seek {}
36
/// The method used to encrypt a file.
///
/// `eazip` does not provide the tools to decrypt these files, but provides the
/// required metadata if you really need to.
///
/// This is only provided for completeness, please don't use this in scenarios
/// where security actually matters and use proper tools (eg `age`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum EncryptionMethod {
    /// Legacy ZipCrypto encryption.
    ZipCrypto,
    /// PKWARE proprietary "Strong Encryption".
    ///
    /// The variant name is misspelled (`Encrytion`); it is kept as-is for API
    /// compatibility.
    StrongEncrytion,
    /// The file is encrypted using AES in CTR mode.
    ///
    /// See [the specification](https://www.winzip.com/en/support/aes-encryption/#file-format1)
    /// for the format of the encrypted files.
    Aes {
        /// The size of the AES key. This may be 128, 192 or 256 bits.
        key_size: u16,
        /// Whether to check the CRC32 of the decrypted content.
        ///
        /// If `true`, this will lead to data leak.
        check_crc32: bool,
    },
}
64
/// An open ZIP archive without a reader
///
/// The entry list and archive comment are loaded once by [`RawArchive::new`];
/// methods that need file contents take the reader as an argument.
#[derive(Debug)]
pub struct RawArchive {
    // Metadata of the entries, as returned by `raw::read_archive`.
    entries: Vec<Metadata>,
    // Raw bytes of the archive-level comment, as returned by `raw::read_archive`.
    comment: Box<[u8]>,
}
71
72impl RawArchive {
73    /// Creates a `RawArchive` from a reader.
74    ///
75    /// The same reader should be used for other methods.
76    #[inline]
77    pub fn new<R: Read + Seek>(reader: &mut R) -> io::Result<Self> {
78        let (entries, comment) = raw::read_archive(reader)?;
79        Ok(Self { entries, comment })
80    }
81
82    /// Gets the list of entries in this archive.
83    #[inline]
84    pub fn entries(&self) -> &[Metadata] {
85        &self.entries
86    }
87
88    /// Gets the comment of the archive.
89    #[inline]
90    pub fn comment(&self) -> &[u8] {
91        &self.comment
92    }
93
94    /// Extracts the archive to the given directory.
95    ///
96    /// The directory will be created if needed, but *not* its parent.
97    pub fn extract<R: BufRead + Seek>(
98        &self,
99        reader: &mut R,
100        at: &std::path::Path,
101    ) -> io::Result<()> {
102        match std::fs::create_dir(at) {
103            Ok(()) => (),
104            Err(err) if err.kind() == io::ErrorKind::AlreadyExists => (),
105            Err(err) => return Err(err),
106        };
107
108        for entry in &self.entries {
109            entry.extract(reader, at)?;
110        }
111
112        Ok(())
113    }
114
115    /// Extracts the archive to the given directory in parallel.
116    ///
117    /// The directory will be created if needed, but *not* its parent.
118    ///
119    /// The reader should implement [`sync_file::ReadAt`], like [`io::Cursor`]
120    /// or [`sync_file::RandomAccessFile`].
121    #[cfg(feature = "parallel")]
122    pub fn parallel_extract<R: sync_file::ReadAt + sync_file::Size + Sync>(
123        &self,
124        reader: &R,
125        at: &std::path::Path,
126    ) -> io::Result<()> {
127        use rayon::prelude::*;
128
129        match std::fs::create_dir(at) {
130            Ok(()) => (),
131            Err(err) if err.kind() == io::ErrorKind::AlreadyExists => (),
132            Err(err) => return Err(err),
133        };
134
135        self.entries.par_iter().try_for_each_init(
136            || io::BufReader::new(sync_file::Adapter::new(reader)),
137            |reader, entry| entry.extract(reader, at),
138        )?;
139
140        Ok(())
141    }
142}
143
144impl FileType {
145    fn test(attr: u32, name: &str) -> Option<Self> {
146        let dos_attr = attr as u16;
147        let unix_mode = (attr >> 16) as u16;
148        let unix_kind = unix_mode >> 12;
149
150        let is_file = (dos_attr & (1 << 5)) != 0 || unix_kind == 8;
151        let is_dir = (dos_attr & (1 << 4)) != 0 || unix_kind == 4;
152        let is_symlink = unix_kind == 10;
153        let trailing_slash = name.ends_with('/');
154
155        match (is_file, is_dir, trailing_slash, is_symlink) {
156            (_, false, false, false) => Some(FileType::File),
157            (false, _, true, false) => Some(FileType::Directory),
158            (false, false, false, true) => Some(FileType::Symlink),
159            _ => None,
160        }
161    }
162}
163
164fn convert_string(raw: &[u8], force_unicode: bool) -> Option<(Cow<'_, str>, Option<u32>)> {
165    // MacOS stores the file name as UTF8, but does not use the unicode flag,
166    // and everyone seems fine with that, so if we meet UTF8 we'll just pretend
167    // that everything is fine.
168    if let Ok(name) = str::from_utf8(raw) {
169        return Some((Cow::Borrowed(name), None));
170    }
171
172    // If we didn't find UTF8 and it wasn't expected, handle it as CP437.
173    if force_unicode {
174        None
175    } else {
176        let name = utils::cp437::convert(raw);
177        Some((Cow::Owned(name), Some(crc32fast::hash(raw))))
178    }
179}
180
/// The metadata of a ZIP entry.
#[derive(Debug)]
pub struct Metadata {
    // Offset of the local header of the entry. Taken from the central header
    // (or its Zip64 extra field); left at 0 when built from a local header.
    header_offset: u64,
    /// Position in the reader at which the raw data of the entry starts.
    ///
    /// [`Metadata::read_raw`] seeks to this offset.
    // NOTE(review): the header parsers in this file initialize this to 0;
    // presumably the archive reader fills it in afterwards — confirm in `raw`.
    pub data_offset: u64,

    /// Size in bytes of the raw data, as read by [`Metadata::read_raw`].
    pub compressed_size: u64,
    /// Expected size in bytes of the content, as enforced by
    /// [`Metadata::content_checker`].
    pub uncompressed_size: u64,
    /// The compression method of the entry.
    ///
    /// For AES-encrypted entries this is the method given by the AES extra
    /// field, not the AES marker stored in the header.
    pub compression_method: CompressionMethod,
    /// Expected CRC32 of the content, as enforced by
    /// [`Metadata::content_checker`].
    pub crc32: u32,
    /// The kind of entry (file, directory or symlink).
    pub file_type: FileType,

    /// Modification time, if an NTFS or extended timestamp extra field gave one.
    pub modification_time: Option<Timestamp>,
    /// Access time, if an NTFS or extended timestamp extra field gave one.
    pub access_time: Option<Timestamp>,
    /// Creation time, if an NTFS or extended timestamp extra field gave one.
    pub creation_time: Option<Timestamp>,

    /// The encryption method, or `None` if the entry is not encrypted.
    pub encryption: Option<EncryptionMethod>,

    // Validated name of the entry (see `utils::validate_name`).
    name: Box<str>,
    // Comment of the entry; empty when built from a local header.
    comment: Box<str>,

    // Bit 3 of `flags`.
    is_streaming: bool,
    // Set once a Zip64 extended information extra field has been parsed.
    is_zip64: bool,
    // Raw `flags` value of the header this metadata was built from.
    flags: u16,
}
206
207impl Metadata {
208    fn from_local_header(
209        header: types::LocalFileHeader,
210        file_name: &[u8],
211        extra_fields: &[u8],
212    ) -> Option<Self> {
213        let flags = header.flags.get();
214        let is_encrypted = flags & (1 << 0) != 0;
215        let is_streaming = flags & (1 << 3) != 0;
216        let strong_encryption = flags & (1 << 6) != 0;
217        let is_unicode = flags & (1 << 11) != 0;
218
219        if { header.signature } != types::LocalFileHeader::SIGNATURE {
220            return None;
221        }
222
223        let (name, name_crc) = convert_string(file_name, is_unicode)?;
224        let name = utils::validate_name(&name)?;
225
226        let encryption = match (is_encrypted, strong_encryption) {
227            (false, false) => None,
228            (false, true) => return None,
229            (true, false) => Some(EncryptionMethod::ZipCrypto),
230            (true, true) => Some(EncryptionMethod::StrongEncrytion),
231        };
232
233        let mut meta = Self {
234            crc32: header.crc32.get(),
235            encryption,
236            header_offset: 0,
237            data_offset: 0,
238
239            compressed_size: header.compressed_size.get() as u64,
240            uncompressed_size: header.uncompressed_size.get() as u64,
241            compression_method: CompressionMethod(header.compression_method.get()),
242            file_type: FileType::File,
243
244            modification_time: None,
245            access_time: None,
246            creation_time: None,
247
248            name,
249            comment: Box::default(),
250
251            is_streaming,
252            is_zip64: false,
253            flags,
254        };
255
256        meta.parse_extra_fields(ExtraFields(extra_fields), name_crc, None)?;
257
258        Some(meta)
259    }
260
261    fn from_central_header(
262        header: types::CentralFileHeader,
263        file_name: &[u8],
264        extra_fields: &[u8],
265        comment: &[u8],
266    ) -> Option<Self> {
267        let flags = header.flags.get();
268        let is_encrypted = flags & (1 << 0) != 0;
269        let is_streaming = flags & (1 << 3) != 0;
270        let strong_encryption = flags & (1 << 6) != 0;
271        let is_unicode = flags & (1 << 11) != 0;
272
273        if { header.signature } != types::CentralFileHeader::SIGNATURE
274            || header.disk_number.get() != 0
275        {
276            return None;
277        }
278
279        let (comment, comment_crc) = convert_string(comment, is_unicode)?;
280        let comment = comment.into_owned().into_boxed_str();
281        let (name, name_crc) = convert_string(file_name, is_unicode)?;
282        let name = utils::validate_name(&name)?;
283        let file_type = FileType::test(header.external_attributes.get(), &name)?;
284
285        let encryption = match (is_encrypted, strong_encryption) {
286            (false, false) => None,
287            (false, true) => return None,
288            (true, false) => Some(EncryptionMethod::ZipCrypto),
289            (true, true) => Some(EncryptionMethod::StrongEncrytion),
290        };
291
292        let mut meta = Self {
293            crc32: header.crc32.get(),
294            encryption,
295            header_offset: header.local_header_offset.get() as u64,
296            data_offset: 0,
297
298            compressed_size: header.compressed_size.get() as u64,
299            uncompressed_size: header.uncompressed_size.get() as u64,
300            compression_method: CompressionMethod(header.compression_method.get()),
301            file_type,
302
303            modification_time: None,
304            access_time: None,
305            creation_time: None,
306
307            name,
308            comment,
309
310            is_streaming,
311            is_zip64: false,
312            flags,
313        };
314
315        meta.parse_extra_fields(ExtraFields(extra_fields), name_crc, comment_crc)?;
316
317        Some(meta)
318    }
319
    /// Applies the extra fields of a header to this metadata.
    ///
    /// `name_crc` and `comment_crc` are the CRC32 values that a unicode name or
    /// comment extra field must declare to be accepted; when one of them is
    /// `None`, the corresponding extra field is rejected.
    ///
    /// Returns `None` if a field is invalid or inconsistent with the header.
    /// `self` may already have been partially updated in that case.
    fn parse_extra_fields(
        &mut self,
        extra_fields: ExtraFields,
        name_crc: Option<u32>,
        comment_crc: Option<u32>,
    ) -> Option<()> {
        for field in extra_fields.iter() {
            match field {
                ExtraField::Zip64ExtendedInformation(mut info) => {
                    // Only the values stored as 0xffff_ffff in the header are
                    // read from the field, in this fixed order.
                    if self.uncompressed_size == 0xffff_ffff {
                        self.uncompressed_size = info.next()?;
                    }
                    if self.compressed_size == 0xffff_ffff {
                        self.compressed_size = info.next()?;
                    }
                    if self.header_offset == 0xffff_ffff {
                        self.header_offset = info.next()?;
                    }
                    // Disk number must be 0
                    info.end()?;
                    self.is_zip64 = true;
                }
                ExtraField::UnicodeComment(unicode) => {
                    // The field must declare the CRC32 of the header comment.
                    if Some(unicode.header_comment_crc32) != comment_crc {
                        return None;
                    }
                    self.comment = unicode.comment.into();
                }

                ExtraField::UnicodeName(unicode) => {
                    // The field must declare the CRC32 of the header name.
                    if Some(unicode.header_name_crc32) != name_crc {
                        return None;
                    }
                    // The replacement name is validated like the header name.
                    self.name = utils::validate_name(unicode.name)?;
                }

                // Timestamp fields overwrite all three times unconditionally,
                // so the last such field wins.
                ExtraField::Ntfs(ntfs) => {
                    self.modification_time = ntfs.times.mtime;
                    self.access_time = ntfs.times.atime;
                    self.creation_time = ntfs.times.ctime;
                }

                ExtraField::ExtendedTimestamp(ts) => {
                    self.modification_time = ts.modification_time;
                    self.access_time = ts.access_time;
                    self.creation_time = ts.creation_time;
                }

                ExtraField::Aes(aes) => {
                    // The header must use the AES marker method, and the CRC32
                    // must be 0 when the field says it is not checked.
                    if self.compression_method != CompressionMethod::AES
                        || (!aes.check_crc32 && self.crc32 != 0)
                    {
                        return None;
                    }
                    // Only an entry currently seen as ZipCrypto (encryption
                    // flag without strong encryption) can be upgraded to AES.
                    let Some(enc @ EncryptionMethod::ZipCrypto) = &mut self.encryption else {
                        return None;
                    };

                    *enc = EncryptionMethod::Aes {
                        key_size: aes.key_size,
                        check_crc32: aes.check_crc32,
                    };
                    // Replace the AES marker by the actual compression method.
                    self.compression_method = aes.compression;
                }

                ExtraField::Invalid => return None,

                // Other fields are ignored.
                _ => (),
            }
        }

        // The AES marker method is still set: no AES extra field replaced it
        // (or the field itself declared AES as the actual method).
        if self.compression_method == CompressionMethod::AES {
            return None;
        }

        Some(())
    }
397
    /// Returns `true` if this file is encrypted.
    ///
    /// This is the case exactly when [`Metadata::encryption`] is `Some`.
    #[inline]
    pub fn is_encrypted(&self) -> bool {
        self.encryption.is_some()
    }
403
    /// Gets the name of this entry.
    ///
    /// This is the validated name from the header, or the one from the unicode
    /// name extra field when such a field was accepted.
    #[inline]
    pub fn name(&self) -> &str {
        &self.name
    }
409
    /// Gets the comment of this entry.
    ///
    /// This is the comment from the central header, or the one from the unicode
    /// comment extra field when such a field was accepted.
    #[inline]
    pub fn comment(&self) -> &str {
        &self.comment
    }
415
416    /// Returns a reader with the content of the file.
417    ///
418    /// Unsupported compression methods and encrypted files will return an error.
419    pub fn read<R: BufRead + Seek>(&self, reader: R) -> io::Result<impl Read + use<R>> {
420        if self.encryption.is_some() {
421            return Err(encrypted_file());
422        }
423
424        let reader = Decompressor::new(self.read_raw(reader)?, self.compression_method)?;
425        Ok(self.content_checker(reader))
426    }
427
428    /// Returns a reader with the content of the file.
429    ///
430    /// Errors if the file is compressed, encrypted or corrupted. Is is not
431    /// necessary to use `Metadata::content_checker` on the result.
432    ///
433    /// It is useful if you know that the file is stored as-is and you want to
434    /// take advantage of the `BufReader` or the `Seek` implementation.
435    pub fn read_stored<R: Read + Seek>(&self, mut reader: R) -> io::Result<io::Take<R>> {
436        if self.encryption.is_some() {
437            return Err(encrypted_file());
438        }
439        if self.compression_method != CompressionMethod::STORE {
440            return Err(compressed());
441        }
442
443        // Check CRC beforehand. Length has already been checked.
444        let mut checker = utils::Crc32Checker::new(self.read_raw(&mut reader)?, self.crc32);
445        std::io::copy(&mut checker, &mut io::sink())?;
446
447        self.read_raw(reader)
448    }
449
450    /// Returns a reader with the raw, uncompressed, content of the file.
451    ///
452    /// The uncompressed content should be checked with `content_checker`.
453    pub fn read_raw<R: Read + Seek>(&self, mut reader: R) -> io::Result<io::Take<R>> {
454        reader.seek(io::SeekFrom::Start(self.data_offset))?;
455        Ok(reader.take(self.compressed_size))
456    }
457
458    /// Wraps a reader to check that its content matches this metadata.
459    ///
460    /// It is particularly  useful in combinaison of `read_raw`.
461    #[inline]
462    pub fn content_checker<R: Read>(&self, reader: R) -> impl Read + use<R> {
463        utils::Crc32Checker::new(
464            utils::LengthChecker::new(reader, self.uncompressed_size),
465            self.crc32,
466        )
467    }
468
469    /// Extracts this entry as if the root of the archive was at `root`.
470    #[inline]
471    pub fn extract<R: BufRead + Seek>(
472        &self,
473        reader: &mut R,
474        root: impl AsRef<std::path::Path>,
475    ) -> io::Result<()> {
476        self._extract(reader, root.as_ref())
477    }
478
479    fn _extract(&self, reader: &mut dyn BufReadSeek, at: &std::path::Path) -> io::Result<()> {
480        if !std::fs::metadata(at)?.is_dir() {
481            return Err(io::Error::from(io::ErrorKind::NotFound));
482        }
483
484        let path = at.join(&*self.name);
485        std::fs::create_dir_all(path.parent().unwrap())?;
486
487        match self.file_type {
488            FileType::File => {
489                let mut f = std::fs::File::create_new(&path)?;
490                io::copy(&mut self.read(reader)?, &mut f)?;
491
492                if let Some(mod_time) = self.modification_time {
493                    f.set_times(std::fs::FileTimes::new().set_modified(mod_time.to_std()))?;
494                }
495            }
496            FileType::Directory => {
497                std::fs::create_dir(path)?;
498            }
499            FileType::Symlink => {
500                let target = io::read_to_string(self.read(reader)?)?;
501                if !utils::validate_symlink(&self.name, &target) {
502                    return Err(invalid("invalid symlink target"));
503                }
504
505                #[cfg(unix)]
506                std::os::unix::fs::symlink(target, path)?;
507
508                #[cfg(windows)]
509                if target.ends_with('/') {
510                    std::os::windows::fs::symlink_dir(target, path)?;
511                } else {
512                    std::os::windows::fs::symlink_file(target, path)?;
513                }
514
515                #[cfg(not(any(unix, windows)))]
516                std::fs::write(path, target.as_bytes())?;
517            }
518        }
519
520        Ok(())
521    }
522}
523
/// An open ZIP archive.
///
/// This type owns the reader. If you need something more flexible, use
/// [`RawArchive`] instead.
///
/// # Example
///
/// Print the name and content of each file in the archive:
///
/// ```no_run
/// let mut archive = eazip::Archive::open("example.zip")?;
///
/// for i in 0..archive.entries().len() {
///     let mut entry = archive.get_by_index(i).unwrap();
///     let name = entry.metadata().name();
///     let content = std::io::read_to_string(entry.read()?)?;
///
///     println!("{name}: {content}");
/// }
///
/// # Ok::<(), std::io::Error>(())
/// ```
#[derive(Debug)]
pub struct Archive<R> {
    // Parsed archive structure (entries and comment).
    inner: RawArchive,
    // Maps each entry name to its index in `inner.entries()`.
    names: HashMap<Box<str>, usize>,
    // The reader the archive was opened from, used to read file contents.
    reader: R,
}
552
553impl Archive<io::BufReader<std::fs::File>> {
554    /// Opens the given file as a ZIP archive.
555    #[inline]
556    pub fn open(path: impl AsRef<std::path::Path>) -> io::Result<Self> {
557        Self::_open(path.as_ref())
558    }
559
560    fn _open(path: &std::path::Path) -> io::Result<Self> {
561        Self::new(io::BufReader::new(std::fs::File::open(path)?))
562    }
563}
564
#[cfg(feature = "parallel")]
impl Archive<io::BufReader<sync_file::SyncFile>> {
    /// Opens the ZIP archive stored in the file at `path`, ready for parallel
    /// extract.
    ///
    /// The file is opened as a [`sync_file::SyncFile`] and read through an
    /// [`io::BufReader`].
    #[inline]
    pub fn open_parallel(path: impl AsRef<std::path::Path>) -> io::Result<Self> {
        let path = path.as_ref();
        Self::_open(path)
    }

    // Non-generic body of `open_parallel`.
    fn _open(path: &std::path::Path) -> io::Result<Self> {
        let file = sync_file::SyncFile::open(path)?;
        let buffered = io::BufReader::new(file);
        Self::new(buffered)
    }
}
577
578impl<R: BufRead + Seek> Archive<R> {
579    /// Opens a ZIP archive from a reader.
580    ///
581    /// This also perform many validation checks on the archive to make sure
582    /// that is it well-formed and does not have dangerous or duplicated paths.
583    /// The validity of file contents is checked lazily when reading them.
584    ///
585    /// The exact rules around validation are not part of semver guaranties and
586    /// may change at every release.
587    ///
588    /// **The targets of symlinks are not checked yet here**, though they are
589    /// through `extract` and `extract_parallel`.
590    pub fn new(mut reader: R) -> io::Result<Self> {
591        let inner = RawArchive::new(&mut reader)?;
592
593        let names = inner
594            .entries()
595            .iter()
596            .enumerate()
597            .map(|(i, meta)| (meta.name().into(), i))
598            .collect();
599
600        Ok(Self {
601            inner,
602            names,
603            reader,
604        })
605    }
606
607    /// Gets the list of entries in the archive.
608    #[inline]
609    pub fn entries(&self) -> &[Metadata] {
610        &self.inner.entries
611    }
612
613    /// Gets a file by its index.
614    #[inline]
615    pub fn get_by_index(&mut self, index: usize) -> Option<File<'_, R>> {
616        let metadata = self.inner.entries().get(index)?;
617        Some(File {
618            metadata,
619            reader: &mut self.reader,
620        })
621    }
622
623    /// Gets a file by its name.
624    pub fn get_by_name(&mut self, name: &str) -> Option<File<'_, R>> {
625        let index = *self.names.get(name)?;
626        self.get_by_index(index)
627    }
628
629    /// Gets the index of a file in [`Self::entries`] by its name.
630    pub fn index_of(&self, name: &str) -> Option<usize> {
631        self.names.get(name).copied()
632    }
633
634    /// Gets the comment of the archive.
635    #[inline]
636    pub fn commment(&self) -> &[u8] {
637        &self.inner.comment
638    }
639
640    /// Extracts the archive to the given directory.
641    ///
642    /// The directory will be created if needed, but *not* its parent.
643    #[inline]
644    pub fn extract(&mut self, at: impl AsRef<std::path::Path>) -> io::Result<()> {
645        self.inner.extract(&mut self.reader, at.as_ref())
646    }
647
648    /// Extracts the archive to the given directory in parallel.
649    ///
650    /// The directory will be created if needed, but *not* its parent.
651    ///
652    /// The reader should implement [`sync_file::ReadAt`], like [`io::Cursor`]
653    /// or [`sync_file::SyncFile`].
654    #[cfg(feature = "parallel")]
655    #[inline]
656    pub fn parallel_extract(&self, at: impl AsRef<std::path::Path>) -> io::Result<()>
657    where
658        R: sync_file::ReadAt + sync_file::Size + Sync,
659    {
660        self.inner.parallel_extract(&self.reader, at.as_ref())
661    }
662
663    /// Gets a shared reference to the underlying reader.
664    #[inline]
665    pub fn get_ref(&self) -> &R {
666        &self.reader
667    }
668
669    /// Gets a mutable reference to the underlying reader.
670    #[inline]
671    pub fn get_mut(&mut self) -> &mut R {
672        &mut self.reader
673    }
674}
675
/// A file in a ZIP archive.
///
/// Pairs the metadata of an entry with a mutable borrow of the reader to read
/// its content from.
#[derive(Debug)]
pub struct File<'a, R> {
    // Metadata of the entry.
    metadata: &'a Metadata,
    // Reader the content of the entry is read from.
    reader: &'a mut R,
}
682
683impl<'a, R: BufRead + Seek> File<'a, R> {
684    /// Gets the metadata of the file.
685    ///
686    /// The lifetime of the returned reference is bound to the `Archive`, so it
687    /// can outlive `self`.
688    #[inline]
689    pub fn metadata(&self) -> &'a Metadata {
690        self.metadata
691    }
692
693    /// Returns a reader with the content of the file.
694    ///
695    /// Unsupported compression methods will return an error.
696    #[inline]
697    pub fn read(&mut self) -> io::Result<impl Read + '_> {
698        self.metadata.read(&mut *self.reader)
699    }
700
701    /// Returns a reader with the content of the file.
702    ///
703    /// Errors if the file is compressed, encrypted or corrupted. Is is not
704    /// necessary to use `Metadata::content_checker` on the result.
705    ///
706    /// It is useful if you know that the file is stored as-is and you want to
707    /// take advantage of the `BufReader` or the `Seek` implementation.
708    pub fn read_stored(self) -> io::Result<io::Take<&'a mut R>> {
709        self.metadata.read_stored(self.reader)
710    }
711
712    /// Returns a reader with the raw, compressed, content of the file.
713    ///
714    /// The uncompressed content should be checked with [`Metadata::content_checker`].
715    #[inline]
716    pub fn read_raw(&mut self) -> io::Result<io::Take<&mut R>> {
717        self.metadata.read_raw(self.reader)
718    }
719
720    /// Consumes self, returning the underlying reader.
721    ///
722    /// This reader can be used with [`Metadata::read_raw`] to read the raw,
723    /// compressed content of the file. The uncompressed content should then be
724    /// checked with [`Metadata::content_checker`].
725    pub fn into_reader(self) -> &'a mut R {
726        self.reader
727    }
728}