"""Interface to the libbzip2 compression library.

This module provides a file interface, classes for incremental
(de)compression, and functions for one-shot (de)compression.
"""

# Names exported by "from bz2 import *".
__all__ = [
    "BZ2File",
    "BZ2Compressor",
    "BZ2Decompressor",
    "open",
    "compress",
    "decompress",
]

__author__ = "Nadeem Vawda <nadeem.vawda@gmail.com>"
|
|
|
|
|
2015-03-11 17:18:03 +02:00
|
|
|
from builtins import open as _builtin_open
|
2011-04-03 17:05:46 +02:00
|
|
|
import io
|
2016-10-02 20:07:06 +03:00
|
|
|
import os
|
2011-04-03 17:05:46 +02:00
|
|
|
import warnings
|
2015-04-11 00:31:01 +02:00
|
|
|
import _compression
|
2017-09-07 18:56:24 +02:00
|
|
|
from threading import RLock
|
2012-01-18 01:57:14 +02:00
|
|
|
|
2011-04-03 17:05:46 +02:00
|
|
|
from _bz2 import BZ2Compressor, BZ2Decompressor
|
|
|
|
|
|
|
|
|
|
|
|
# State codes kept in BZ2File._mode.
_MODE_CLOSED = 0   # file is closed (also the state before __init__ finishes)
_MODE_READ = 1     # opened for reading / decompression
# Value 2 no longer used
_MODE_WRITE = 3    # opened for writing / compression
|
|
|
|
|
|
|
|
|
2015-04-11 00:31:01 +02:00
|
|
|
class BZ2File(_compression.BaseStream):

    """A file object providing transparent bzip2 (de)compression.

    A BZ2File can act as a wrapper for an existing file object, or refer
    directly to a named file on disk.

    Note that BZ2File provides a *binary* file interface - data read is
    returned as bytes, and data to be written should be given as bytes.
    """

    def __init__(self, filename, mode="r", *, compresslevel=9):
        """Open a bzip2-compressed file.

        If filename is a str, bytes, or PathLike object, it gives the
        name of the file to be opened. Otherwise, it should be a file
        object, which will be used to read or write the compressed data.

        mode can be 'r' for reading (default), 'w' for (over)writing,
        'x' for creating exclusively, or 'a' for appending. These can
        equivalently be given as 'rb', 'wb', 'xb', and 'ab'.

        If mode is 'w', 'x' or 'a', compresslevel can be a number between 1
        and 9 specifying the level of compression: 1 produces the least
        compression, and 9 (default) produces the most compression.

        If mode is 'r', the input file may be the concatenation of
        multiple compressed streams.

        Raises ValueError if compresslevel is outside 1..9 or mode is not
        one of the values above, and TypeError if filename is neither a
        path nor an object with a read() or write() method.
        """
        # This lock must be recursive, so that BufferedIOBase's
        # writelines() does not deadlock.
        self._lock = RLock()
        self._fp = None
        self._closefp = False
        self._mode = _MODE_CLOSED

        if not (1 <= compresslevel <= 9):
            raise ValueError("compresslevel must be between 1 and 9")

        if mode in ("", "r", "rb"):
            mode = "rb"
            mode_code = _MODE_READ
        elif mode in ("w", "wb", "x", "xb", "a", "ab"):
            # All write-like modes share the same setup; only the mode
            # string handed to the builtin open() differs.
            mode = mode[0] + "b"
            mode_code = _MODE_WRITE
            self._compressor = BZ2Compressor(compresslevel)
        else:
            raise ValueError("Invalid mode: %r" % (mode,))

        if isinstance(filename, (str, bytes, os.PathLike)):
            self._fp = _builtin_open(filename, mode)
            # We opened the file ourselves, so close() must close it too.
            self._closefp = True
            self._mode = mode_code
        elif hasattr(filename, "read") or hasattr(filename, "write"):
            # Caller-supplied file object: it stays open after close().
            self._fp = filename
            self._mode = mode_code
        else:
            raise TypeError("filename must be a str, bytes, file or PathLike object")

        if self._mode == _MODE_READ:
            raw = _compression.DecompressReader(self._fp,
                BZ2Decompressor, trailing_error=OSError)
            self._buffer = io.BufferedReader(raw)
        else:
            # Number of uncompressed bytes written so far; reported by tell().
            self._pos = 0

    def close(self):
        """Flush and close the file.

        May be called more than once without error. Once the file is
        closed, any other operation on it will raise a ValueError.
        """
        with self._lock:
            if self._mode == _MODE_CLOSED:
                return
            try:
                if self._mode == _MODE_READ:
                    self._buffer.close()
                elif self._mode == _MODE_WRITE:
                    # Emit whatever the compressor is still holding.
                    self._fp.write(self._compressor.flush())
                    self._compressor = None
            finally:
                # Reset the state even if flushing or closing failed.
                try:
                    if self._closefp:
                        self._fp.close()
                finally:
                    self._fp = None
                    self._closefp = False
                    self._mode = _MODE_CLOSED
                    self._buffer = None

    @property
    def closed(self):
        """True if this file is closed."""
        return self._mode == _MODE_CLOSED

    def fileno(self):
        """Return the file descriptor for the underlying file."""
        self._check_not_closed()
        return self._fp.fileno()

    def seekable(self):
        """Return whether the file supports seeking."""
        return self.readable() and self._buffer.seekable()

    def readable(self):
        """Return whether the file was opened for reading."""
        self._check_not_closed()
        return self._mode == _MODE_READ

    def writable(self):
        """Return whether the file was opened for writing."""
        self._check_not_closed()
        return self._mode == _MODE_WRITE

    def peek(self, n=0):
        """Return buffered data without advancing the file position.

        Always returns at least one byte of data, unless at EOF.
        The exact number of bytes returned is unspecified.
        """
        with self._lock:
            self._check_can_read()
            # Relies on the undocumented fact that BufferedReader.peek()
            # always returns at least one byte (except at EOF), independent
            # of the value of n
            return self._buffer.peek(n)

    def read(self, size=-1):
        """Read up to size uncompressed bytes from the file.

        If size is negative or omitted, read until EOF is reached.
        Returns b'' if the file is already at EOF.
        """
        with self._lock:
            self._check_can_read()
            return self._buffer.read(size)

    def read1(self, size=-1):
        """Read up to size uncompressed bytes, while trying to avoid
        making multiple reads from the underlying stream. Reads up to a
        buffer's worth of data if size is negative.

        Returns b'' if the file is at EOF.
        """
        with self._lock:
            self._check_can_read()
            if size < 0:
                size = io.DEFAULT_BUFFER_SIZE
            return self._buffer.read1(size)

    def readinto(self, b):
        """Read bytes into b.

        Returns the number of bytes read (0 for EOF).
        """
        with self._lock:
            self._check_can_read()
            return self._buffer.readinto(b)

    def readline(self, size=-1):
        """Read a line of uncompressed bytes from the file.

        The terminating newline (if present) is retained. If size is
        non-negative, no more than size bytes will be read (in which
        case the line may be incomplete). Returns b'' if already at EOF.

        Raises TypeError if size is not an int and has no __index__().
        """
        if not isinstance(size, int):
            if not hasattr(size, "__index__"):
                raise TypeError("Integer argument expected")
            size = size.__index__()
        with self._lock:
            self._check_can_read()
            return self._buffer.readline(size)

    def readlines(self, size=-1):
        """Read a list of lines of uncompressed bytes from the file.

        size can be specified to control the number of lines read: no
        further lines will be read once the total size of the lines read
        so far equals or exceeds size.

        Raises TypeError if size is not an int and has no __index__().
        """
        if not isinstance(size, int):
            if not hasattr(size, "__index__"):
                raise TypeError("Integer argument expected")
            size = size.__index__()
        with self._lock:
            self._check_can_read()
            return self._buffer.readlines(size)

    def write(self, data):
        """Write a bytes-like object to the file.

        Returns the number of uncompressed bytes written, which is
        always the length of data in bytes. Note that due to buffering,
        the file on disk may not reflect the data written until close()
        is called.
        """
        with self._lock:
            self._check_can_write()
            if isinstance(data, (bytes, bytearray)):
                length = len(data)
            else:
                # Accept any object supporting the buffer protocol. len()
                # would count items, not bytes, for multi-byte item types
                # (e.g. array.array('H')), so use the buffer's byte size.
                data = memoryview(data)
                length = data.nbytes
            compressed = self._compressor.compress(data)
            self._fp.write(compressed)
            self._pos += length
            return length

    def writelines(self, seq):
        """Write a sequence of byte strings to the file.

        seq can be any iterable yielding byte strings. The work is
        delegated to the base class implementation; there is no return
        value.

        Line separators are not added between the written byte strings.
        """
        with self._lock:
            return _compression.BaseStream.writelines(self, seq)

    def seek(self, offset, whence=io.SEEK_SET):
        """Change the file position.

        The new position is specified by offset, relative to the
        position indicated by whence. Values for whence are:

            0: start of stream (default); offset must not be negative
            1: current stream position
            2: end of stream; offset must not be positive

        Returns the new file position.

        Note that seeking is emulated, so depending on the parameters,
        this operation may be extremely slow.
        """
        with self._lock:
            self._check_can_seek()
            return self._buffer.seek(offset, whence)

    def tell(self):
        """Return the current file position."""
        with self._lock:
            self._check_not_closed()
            if self._mode == _MODE_READ:
                return self._buffer.tell()
            return self._pos
|
|
|
|
|
|
|
|
|
2012-06-04 23:32:38 +02:00
|
|
|
def open(filename, mode="rb", compresslevel=9,
         encoding=None, errors=None, newline=None):
    """Open a bzip2-compressed file in binary or text mode.

    The filename argument can be an actual filename (a str, bytes, or
    PathLike object), or an existing file object to read from or write
    to.

    The mode argument can be "r", "rb", "w", "wb", "x", "xb", "a" or
    "ab" for binary mode, or "rt", "wt", "xt" or "at" for text mode.
    The default mode is "rb", and the default compresslevel is 9.

    For binary mode, this function is equivalent to the BZ2File
    constructor: BZ2File(filename, mode, compresslevel=compresslevel).
    In this case, the encoding, errors and newline arguments must not
    be provided.

    For text mode, a BZ2File object is created, and wrapped in an
    io.TextIOWrapper instance with the specified encoding, error
    handling behavior, and line ending(s).

    Raises ValueError if mode contains both "t" and "b", or if a
    text-only argument is given in binary mode.
    """
    text_mode = "t" in mode
    if text_mode:
        if "b" in mode:
            raise ValueError("Invalid mode: %r" % (mode,))
    else:
        if encoding is not None:
            raise ValueError("Argument 'encoding' not supported in binary mode")
        if errors is not None:
            raise ValueError("Argument 'errors' not supported in binary mode")
        if newline is not None:
            raise ValueError("Argument 'newline' not supported in binary mode")

    bz_mode = mode.replace("t", "")
    binary_file = BZ2File(filename, bz_mode, compresslevel=compresslevel)

    if not text_mode:
        return binary_file

    try:
        return io.TextIOWrapper(binary_file, encoding, errors, newline)
    except Exception:
        # Don't leave the freshly opened BZ2File dangling when the text
        # layer cannot be built (e.g. unknown encoding).
        binary_file.close()
        raise
|
|
|
|
|
|
|
|
|
2011-04-03 17:05:46 +02:00
|
|
|
def compress(data, compresslevel=9):
    """Compress a block of data in one shot and return the result.

    compresslevel, if given, must be a number between 1 and 9.

    For incremental compression, use a BZ2Compressor object instead.
    """
    compressor = BZ2Compressor(compresslevel)
    body = compressor.compress(data)
    # flush() finishes the stream and returns whatever is still buffered.
    tail = compressor.flush()
    return body + tail
|
|
|
|
|
|
|
|
|
|
|
|
def decompress(data):
    """Decompress a block of data, which may hold several bzip2 streams.

    Each stream is decoded in turn and the outputs are concatenated.
    Trailing bytes that do not form a valid stream are ignored once at
    least one stream has been decoded; a ValueError is raised if a
    stream ends before its end-of-stream marker.

    For incremental decompression, use a BZ2Decompressor object instead.
    """
    chunks = []
    remaining = data
    while remaining:
        stream = BZ2Decompressor()
        try:
            piece = stream.decompress(remaining)
        except OSError:
            if not chunks:
                raise  # Error on the first iteration; bail out.
            break  # Leftover data is not a valid bzip2 stream; ignore it.
        chunks.append(piece)
        if not stream.eof:
            raise ValueError("Compressed data ended before the "
                             "end-of-stream marker was reached")
        # Whatever follows the finished stream is fed to the next round.
        remaining = stream.unused_data
    return b"".join(chunks)
|