Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Compressor fixes #3142

Merged
merged 6 commits into from
Mar 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 14 additions & 3 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,9 +72,9 @@ Composer uses [pytest-codeblocks](https://github.com/nschloe/pytest-codeblocks)

To test your changes locally, run:

1. `make test` # run CPU tests
1. `make test-gpu` # run GPU tests
1. `cd docs && make doctest` # run doctests
* `make test` # run CPU tests
* `make test-gpu` # run GPU tests
* `cd docs && make doctest` # run doctests

Some of our checks test distributed training as well. To test these, run:

Expand All @@ -87,6 +87,17 @@ See the [Makefile](/Makefile) for more information.

To run the pre-commit hooks manually (they check code formatting and type annotations), run `pre-commit run --all-files`.

### Docker
To run the tests in the provided docker containers:

* `docker pull mosaicml/composer` (or an alternative image like `mosaicml/composer:latest_cpu`)
* `docker run --rm -v "$(pwd)":/composer --user $(id -u):$(id -g) -it mosaicml/composer` (run from the repository root)
* from inside the container
* `cd /composer`
* `pip install -e .`
* `pytest <args>` or `make <args>` to run the desired tests


## Code Style & Typing

See the [Composer Style Guide](/STYLE_GUIDE.md) for guidelines on how to structure and format your code.
Expand Down
40 changes: 35 additions & 5 deletions composer/utils/compression.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,38 @@ def is_compressed_pt(filename: str) -> bool:


class CliCompressor:
"""Base class for data compression CLI tools."""
"""Base class for data compression CLI tools.

This class handles compression and decompression of data by piping it through
CLI compressor tools installed on the system, e.g. the `gzip` command for producing `.gz` files.

Example:
.. code-block:: python

compressor = CliCompressor('gz', 'gzip')

with compressor.compress('myfile.txt.gz') as f:
f.write(b'foo')

with compressor.decompress('myfile.txt.gz') as f:
assert f.read() == b'foo'

Args:
extension (str): The suffix used to identify files that the compressor supports (without a leading `.`).
cmd (str, optional): The name of the CLI tool that this compressor uses. Defaults to `None`, in which case
it is assumed that the tool name is the same as the extension.
"""

def __init__(self, extension: str, cmd: Optional[str] = None) -> None:
self.extension = extension
self.cmd = cmd if cmd is not None else extension

def __repr__(self) -> str:
return f'CliCompressor({self.extension!r}, {self.cmd!r})'

@property
def exists(self) -> bool:
"""Whether the CLI tool used by this compressor can be found."""
return shutil.which(self.cmd) is not None

def check_exists(self) -> None:
Expand All @@ -44,9 +68,10 @@ def _compress_cmd(self) -> List[str]:
return [self.cmd]

@contextmanager
def compress(self, filename: str) -> Iterator[IO[bytes]]:
def compress(self, out_filename: str) -> Iterator[IO[bytes]]:
"""Compress some data, saving to the given file."""
self.check_exists()
with open(filename, 'wb') as f:
with open(out_filename, 'wb') as f:
proc = subprocess.Popen(
self._compress_cmd(),
stdin=subprocess.PIPE,
Expand All @@ -55,21 +80,26 @@ def compress(self, filename: str) -> Iterator[IO[bytes]]:
assert proc.stdin is not None
yield proc.stdin
proc.stdin.close()
proc.wait()
returncode = proc.wait()
if returncode != 0:
raise IOError(f'failed to compress to "{out_filename}" using {self!r} (return code {returncode})')

def _decompress_cmd(self, filename: str) -> List[str]:
return [self.cmd, '-dc', filename]

@contextmanager
def decompress(self, in_filename: str) -> Iterator[IO[bytes]]:
"""Decompress the content of the given file, providing the output as a file-like object."""
self.check_exists()
proc = subprocess.Popen(
self._decompress_cmd(in_filename),
stdout=subprocess.PIPE,
)
assert proc.stdout is not None
yield proc.stdout
proc.wait()
returncode = proc.wait()
if returncode != 0:
raise IOError(f'failed to decompress "{in_filename}" using {self!r} (return code {returncode})')


def get_compressor(filename: str) -> CliCompressor:
Expand Down
1 change: 0 additions & 1 deletion docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,6 @@ RUN apt-get update && \
bzip2 \
gzip \
lz4 \
lzma \
lzop \
xz-utils \
zstd \
Expand Down
5 changes: 3 additions & 2 deletions tests/utils/test_compression.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def test_compressor(tmp_path: Path, compressor: CliCompressor) -> None:
if not compressor.exists:
pytest.skip(reason=f'compressor {compressor.cmd} not found')

test_file = tmp_path / 'my_file'
test_file = tmp_path / f'my_file.{compressor.extension}'
test_data = b'foo foo foo'

with compressor.compress(str(test_file)) as f:
Expand All @@ -54,4 +54,5 @@ def test_compressor(tmp_path: Path, compressor: CliCompressor) -> None:
assert test_file.read_bytes() != test_data

with compressor.decompress(str(test_file)) as f:
assert f.read() == test_data
decompressed = f.read()
assert decompressed == test_data
Loading