diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml new file mode 100644 index 0000000..78ec220 --- /dev/null +++ b/.github/workflows/main.yml @@ -0,0 +1,19 @@ +name: ci + +on: + - push + - pull_request + +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v1 + - uses: actions/setup-java@v3 + with: + distribution: 'temurin' + java-version: 8 + - name: Maven Install + run: mvn install -B -V -DskipTests -Dair.check.skip-all + - name: Maven Tests + run: mvn install -B -P ci diff --git a/README.md b/README.md index 2a5b49c..60cb7c6 100644 --- a/README.md +++ b/README.md @@ -1,219 +1,3 @@ # 🚚 MOVED 🚚 ### __Future development of Snappy without JNI has moved to [aircompressor](https://github.com/airlift/aircompressor)__ - -
- -# Snappy in Java - -This is a rewrite (port) of [Snappy](http://code.google.com/p/snappy/) written in -pure Java. This compression code produces a byte-for-byte exact copy of the output -created by the original C++ code, and extremely fast. - -# Performance - -The Snappy micro-benchmark has been ported, and can be used to measure -the performance of this code against the excellent Snappy JNI wrapper from -[xerial](http://code.google.com/p/snappy-java/). As you can see in the results -below, the pure Java port is 20-30% faster for block compress, 0-10% slower -for block uncompress, and 0-5% slower for round-trip block compression. These -results were run with Java 7 on a Core i7, 64-bit Mac. - -As a second more independent test, the performance has been measured using the -Ning JVM compression benchmark against Snappy JNI, and the pure Java -[Ning LZF](https://github.com/ning/compress) codec. The -[results](http://dain.github.com/snappy/) show that the pure Java Snappy is -20-30% faster than JNI Snappy for compression, and is typically 10-20% slower -for decompression. Both, the pure Java Snappy and JNI Snappy implementations -are faster that the Ning LZF codec. These results were run with Java 6 on a -Core i7, 64-bit Mac. - -The difference in performance between these two tests is due to the difference -in JVM version; Java 7 is consistently 5-10% faster than Java 6 in the -compression code. As with all benchmarks your mileage will vary, so test with -your actual use case. - - - -### Block Compress -

-                        JNI      Java         JNI        Java
-Input        Size  Compress  Compress  Throughput  Throughput  Change
----------------------------------------------------------------------
-html       102400     76.4%     76.4%   294.9MB/s   384.8MB/s  +30.5%  html
-urls       702087     49.1%     49.1%   178.7MB/s   226.5MB/s  +26.8%  urls
-jpg        126958      0.1%      0.1%     2.7GB/s     3.2GB/s  +17.4%  jpg (not compressible)
-pdf         94330     17.9%     17.9%   642.4MB/s   910.3MB/s  +41.7%  pdf
-html4      409600     76.4%     76.4%   289.2MB/s   377.3MB/s  +30.5%  html4
-cp          24603     51.9%     51.9%   166.4MB/s   233.7MB/s  +40.5%  cp
-c           11150     57.6%     57.6%   177.1MB/s   295.4MB/s  +66.8%  c
-lsp          3721     51.6%     51.6%   245.5MB/s   278.0MB/s  +13.2%  lsp
-xls       1029744     58.7%     58.7%   263.0MB/s   292.5MB/s  +11.2%  xls
-txt1       152089     40.2%     40.2%   116.8MB/s   163.1MB/s  +39.7%  txt1
-txt2       125179     35.9%     35.9%   112.5MB/s   153.4MB/s  +36.3%  txt2
-txt3       426754     42.9%     42.9%   123.3MB/s   169.8MB/s  +37.6%  txt3
-txt4       481861     31.7%     31.7%   107.8MB/s   146.2MB/s  +35.6%  txt4
-bin        513216     81.8%     81.8%   413.1MB/s   497.8MB/s  +20.5%  bin
-sum         38240     48.1%     48.1%   162.4MB/s   213.9MB/s  +31.7%  sum
-man          4227     40.6%     40.6%   194.6MB/s   241.7MB/s  +24.2%  man
-pb         118588     76.8%     76.8%   363.7MB/s   450.3MB/s  +23.8%  pb
-gaviota    184320     61.7%     61.7%   166.7MB/s   253.7MB/s  +52.2%  gaviota
-
- - -### Block Uncompress -

-                        JNI      Java         JNI        Java
-Input        Size  Compress  Compress  Throughput  Throughput  Change
----------------------------------------------------------------------
-html       102400     76.4%     76.4%     1.5GB/s     1.3GB/s  -12.2%  html
-urls       702087     49.1%     49.1%   969.2MB/s   827.5MB/s  -14.6%  urls
-jpg        126958      0.1%      0.1%    18.6GB/s    19.4GB/s   +4.2%  jpg (not compressible)
-pdf         94330     17.9%     17.9%     4.1GB/s     3.7GB/s   -8.8%  pdf
-html4      409600     76.4%     76.4%     1.5GB/s     1.2GB/s  -16.8%  html4
-cp          24603     51.9%     51.9%   965.2MB/s   956.0MB/s   -1.0%  cp
-c           11150     57.6%     57.6%   989.1MB/s   924.9MB/s   -6.5%  c
-lsp          3721     51.6%     51.6%   991.6MB/s   964.8MB/s   -2.7%  lsp
-xls       1029744     58.7%     58.7%   798.4MB/s   747.3MB/s   -6.4%  xls
-txt1       152089     40.2%     40.2%   643.8MB/s   580.8MB/s   -9.8%  txt1
-txt2       125179     35.9%     35.9%   610.0MB/s   549.6MB/s   -9.9%  txt2
-txt3       426754     42.9%     42.9%   683.8MB/s   614.4MB/s  -10.2%  txt3
-txt4       481861     31.7%     31.7%   565.4MB/s   505.5MB/s  -10.6%  txt4
-bin        513216     81.8%     81.8%     1.5GB/s     1.2GB/s  -20.4%  bin
-sum         38240     48.1%     48.1%   838.1MB/s   771.6MB/s   -7.9%  sum
-man          4227     40.6%     40.6%   856.9MB/s   847.2MB/s   -1.1%  man
-pb         118588     76.8%     76.8%     1.7GB/s     1.5GB/s  -12.9%  pb
-gaviota    184320     61.7%     61.7%   769.1MB/s   693.4MB/s   -9.9%  gaviota
-
- - -### Block Round Trip -

-                        JNI      Java         JNI        Java
-Input        Size  Compress  Compress  Throughput  Throughput  Change
----------------------------------------------------------------------
-html       102400     76.4%     76.4%   300.3MB/s   287.1MB/s   -4.4%  html
-urls       702087     49.1%     49.1%   182.7MB/s   177.0MB/s   -3.2%  urls
-jpg        126958      0.1%      0.1%     2.6GB/s     2.6GB/s   +1.1%  jpg (not compressible)
-pdf         94330     17.9%     17.9%   695.3MB/s   680.0MB/s   -2.2%  pdf
-html4      409600     76.4%     76.4%   296.4MB/s   282.1MB/s   -4.8%  html4
-cp          24603     51.9%     51.9%   177.0MB/s   172.5MB/s   -2.5%  cp
-c           11150     57.6%     57.6%   221.7MB/s   218.3MB/s   -1.5%  c
-lsp          3721     51.6%     51.6%   217.3MB/s   216.3MB/s   -0.5%  lsp
-xls       1029744     58.7%     58.7%   213.3MB/s   209.9MB/s   -1.6%  xls
-txt1       152089     40.2%     40.2%   129.4MB/s   126.3MB/s   -2.4%  txt1
-txt2       125179     35.9%     35.9%   121.7MB/s   118.8MB/s   -2.4%  txt2
-txt3       426754     42.9%     42.9%   135.2MB/s   132.8MB/s   -1.8%  txt3
-txt4       481861     31.7%     31.7%   115.2MB/s   113.0MB/s   -1.9%  txt4
-bin        513216     81.8%     81.8%   371.2MB/s   350.7MB/s   -5.5%  bin
-sum         38240     48.1%     48.1%   164.2MB/s   160.0MB/s   -2.6%  sum
-man          4227     40.6%     40.6%   184.8MB/s   185.3MB/s   +0.3%  man
-pb         118588     76.8%     76.8%   344.1MB/s   326.3MB/s   -5.2%  pb
-gaviota    184320     61.7%     61.7%   188.0MB/s   185.2MB/s   -1.5%  gaviota
-
- -# Stream Format - -There is no defined stream format for Snappy, but there is an effort to create -a common format with the Google Snappy project. - -The stream format used in this library has a couple of unique features not -found in the other Snappy stream formats. Like the other formats, the user -input is broken into blocks and each block is compressed. If the compressed -block is smaller that the user input, the compressed block is written, -otherwise the uncompressed original is written. This dramatically improves the -speed of uncompressible input such as JPG images. Additionally, a checksum of -the user input data for each block is written to the stream. This safety check -assures that the stream has not been corrupted in transit or by a bad Snappy -implementation. Finally, like gzip, compressed Snappy files can be -concatenated together without issue, since the input stream will ignore a -Snappy stream header in the middle of a stream. This makes combining files in -Hadoop and S3 trivial. - -The the SnappyOutputStream javadocs contain formal definition of the stream -format. - -## Stream Performance - -The streaming mode performance can not be directly compared to other -compression algorithms since most formats do not contain a checksum. The basic -streaming code is significantly faster that the Snappy JNI library due to -the completely unoptimized stream implementation in Snappy JNI, but once the -check sum is enabled the performance drops off by about 20%. - -### Stream Compress (no checksums) -

-                        JNI      Java         JNI        Java
-Input        Size  Compress  Compress  Throughput  Throughput  Change
----------------------------------------------------------------------
-html       102400     76.4%     76.4%   275.8MB/s   373.5MB/s  +35.4%  html
-urls       702087     49.1%     49.1%   176.5MB/s   225.2MB/s  +27.6%  urls
-jpg        126958      0.1%     -0.0%     1.7GB/s     2.0GB/s  +15.8%  jpg (not compressible)
-pdf         94330     17.8%     16.0%   557.2MB/s   793.2MB/s  +42.4%  pdf
-html4      409600     76.4%     76.4%   281.0MB/s   369.9MB/s  +31.7%  html4
-cp          24603     51.8%     51.8%   151.7MB/s   214.3MB/s  +41.3%  cp
-c           11150     57.4%     57.5%   149.1MB/s   243.3MB/s  +63.1%  c
-lsp          3721     51.1%     51.2%   141.3MB/s   181.1MB/s  +28.2%  lsp
-xls       1029744     58.6%     58.6%   253.9MB/s   290.5MB/s  +14.4%  xls
-txt1       152089     40.2%     40.2%   114.8MB/s   159.4MB/s  +38.8%  txt1
-txt2       125179     35.9%     35.9%   110.0MB/s   150.4MB/s  +36.7%  txt2
-txt3       426754     42.9%     42.9%   121.0MB/s   167.9MB/s  +38.8%  txt3
-txt4       481861     31.6%     31.6%   105.1MB/s   143.2MB/s  +36.2%  txt4
-bin        513216     81.8%     81.8%   387.7MB/s   484.5MB/s  +25.0%  bin
-sum         38240     48.1%     48.1%   153.0MB/s   203.1MB/s  +32.8%  sum
-man          4227     40.2%     40.3%   125.9MB/s   171.9MB/s  +36.5%  man
-pb         118588     76.8%     76.8%   342.2MB/s   431.4MB/s  +26.1%  pb
-gaviota    184320     61.7%     61.7%   161.1MB/s   246.1MB/s  +52.7%  gaviota
-
- - -### Stream Uncompress (no checksums) -

-                        JNI      Java         JNI        Java
-Input        Size  Compress  Compress  Throughput  Throughput  Change
----------------------------------------------------------------------
-html       102400     76.4%     76.4%     1.2GB/s     1.2GB/s   +0.4%  html
-urls       702087     49.1%     49.1%   853.9MB/s   786.6MB/s   -7.9%  urls
-jpg        126958      0.1%     -0.0%     3.0GB/s    10.3GB/s +239.0%  jpg (not compressible)
-pdf         94330     17.8%     16.0%     2.0GB/s     3.4GB/s  +71.5%  pdf
-html4      409600     76.4%     76.4%     1.2GB/s     1.1GB/s   -8.4%  html4
-cp          24603     51.8%     51.8%   785.2MB/s   905.6MB/s  +15.3%  cp
-c           11150     57.4%     57.5%   778.9MB/s   889.7MB/s  +14.2%  c
-lsp          3721     51.1%     51.2%   739.0MB/s   905.5MB/s  +22.5%  lsp
-xls       1029744     58.6%     58.6%   730.3MB/s   718.8MB/s   -1.6%  xls
-txt1       152089     40.2%     40.2%   582.4MB/s   559.0MB/s   -4.0%  txt1
-txt2       125179     35.9%     35.9%   540.7MB/s   526.4MB/s   -2.6%  txt2
-txt3       426754     42.9%     42.9%   620.5MB/s   583.9MB/s   -5.9%  txt3
-txt4       481861     31.6%     31.6%   519.4MB/s   487.0MB/s   -6.2%  txt4
-bin        513216     81.8%     81.8%     1.2GB/s     1.1GB/s  -11.6%  bin
-sum         38240     48.1%     48.1%   693.4MB/s   742.4MB/s   +7.1%  sum
-man          4227     40.2%     40.3%   637.3MB/s   784.3MB/s  +23.1%  man
-pb         118588     76.8%     76.8%     1.4GB/s     1.4GB/s   +0.4%  pb
-gaviota    184320     61.7%     61.7%   688.5MB/s   668.2MB/s   -3.0%  gaviota
-
- - -### Stream RoundTrip (no checksums) -

-                        JNI      Java         JNI        Java
-Input        Size  Compress  Compress  Throughput  Throughput  Change
----------------------------------------------------------------------
-html       102400     76.4%     76.4%   223.8MB/s   272.5MB/s  +21.8%  html
-urls       702087     49.1%     49.1%   142.8MB/s   174.1MB/s  +22.0%  urls
-jpg        126958      0.1%     -0.0%     1.1GB/s     1.6GB/s  +52.1%  jpg (not compressible)
-pdf         94330     17.8%     16.0%   421.9MB/s   610.1MB/s  +44.6%  pdf
-html4      409600     76.4%     76.4%   226.2MB/s   275.5MB/s  +21.8%  html4
-cp          24603     51.8%     51.8%   125.3MB/s   160.3MB/s  +27.9%  cp
-c           11150     57.4%     57.5%   125.1MB/s   183.2MB/s  +46.5%  c
-lsp          3721     51.1%     51.2%   130.6MB/s   149.5MB/s  +14.5%  lsp
-xls       1029744     58.6%     58.6%   188.2MB/s   206.1MB/s   +9.5%  xls
-txt1       152089     40.2%     40.2%    95.3MB/s   123.3MB/s  +29.4%  txt1
-txt2       125179     35.9%     35.9%    91.4MB/s   116.8MB/s  +27.9%  txt2
-txt3       426754     42.9%     42.9%   101.3MB/s   130.3MB/s  +28.6%  txt3
-txt4       481861     31.6%     31.6%    87.9MB/s   111.1MB/s  +26.3%  txt4
-bin        513216     81.8%     81.8%   294.7MB/s   337.9MB/s  +14.7%  bin
-sum         38240     48.1%     48.1%   122.9MB/s   152.9MB/s  +24.3%  sum
-man          4227     40.2%     40.3%   113.0MB/s   139.1MB/s  +23.1%  man
-pb         118588     76.8%     76.8%   269.5MB/s   313.8MB/s  +16.4%  pb
-gaviota    184320     61.7%     61.7%   131.1MB/s   180.3MB/s  +37.6%  gaviota
-
diff --git a/pom.xml b/pom.xml index feb27ca..f7e9d2c 100644 --- a/pom.xml +++ b/pom.xml @@ -62,6 +62,11 @@ true + + maven_central + Maven Central + https://repo.maven.apache.org/maven2/ + @@ -78,29 +83,36 @@ + org.apache.hadoop - hadoop-core - 0.20.2 + hadoop-common + 3.4.0 true provided com.google.guava guava - 13.0.1 + 33.2.0-jre test org.xerial.snappy snappy-java - 1.0.4.1 + 1.1.10.4 test org.testng testng - 6.0.1 + 7.5.1 test @@ -110,7 +122,7 @@ org.apache.maven.plugins maven-enforcer-plugin - 1.0 + 3.4.1 enforce-versions @@ -123,7 +135,7 @@ 3.0.0 - 1.6 + 1.8 @@ -134,47 +146,6 @@ org.apache.maven.plugins maven-source-plugin - - - org.apache.maven.plugins - maven-jar-plugin - 2.3.2 - - - binary - package - - jar - - - bin - - - org.iq80.snappy.Main - - - - - - - - - org.skife.maven - really-executable-jar-maven-plugin - 1.0.3 - - - package - - really-executable-jar - - - bin - - - - - @@ -212,8 +183,8 @@ maven-compiler-plugin 2.3.2 - 1.6 - 1.6 + 1.8 + 1.8 diff --git a/src/main/java/org/iq80/snappy/AbstractSnappyInputStream.java b/src/main/java/org/iq80/snappy/AbstractSnappyInputStream.java deleted file mode 100644 index 322bbde..0000000 --- a/src/main/java/org/iq80/snappy/AbstractSnappyInputStream.java +++ /dev/null @@ -1,307 +0,0 @@ -/* - * Copyright (C) 2011 the original author or authors. - * See the notice.md file distributed with this work for additional - * information regarding copyright ownership. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.iq80.snappy; - -import java.io.EOFException; -import java.io.IOException; -import java.io.InputStream; -import java.util.Arrays; - -import static java.lang.Math.min; -import static org.iq80.snappy.SnappyInternalUtils.checkNotNull; -import static org.iq80.snappy.SnappyInternalUtils.checkPositionIndexes; -import static org.iq80.snappy.SnappyInternalUtils.readBytes; - -/** - * A common base class for frame based snappy input streams. - */ -abstract class AbstractSnappyInputStream - extends InputStream -{ - private final InputStream in; - private final byte[] frameHeader; - private final boolean verifyChecksums; - private final BufferRecycler recycler; - - /** - * A single frame read from the underlying {@link InputStream}. - */ - private byte[] input; - - /** - * The decompressed data from {@link #input}. - */ - private byte[] uncompressed; - - /** - * Indicates if this instance has been closed. - */ - private boolean closed; - - /** - * Indicates if we have reached the EOF on {@link #in}. - */ - private boolean eof; - - /** - * The position in {@link #input} to read to. - */ - private int valid; - - /** - * The next position to read from {@link #buffer}. - */ - private int position; - - /** - * Buffer is a reference to the real buffer of uncompressed data for the - * current block: uncompressed if the block is compressed, or input if it is - * not. - */ - private byte[] buffer; - - /** - * Creates a Snappy input stream to read data from the specified underlying - * input stream. 
- * - * @param in the underlying input stream - * @param verifyChecksums if true, checksums in input stream will be verified - * @param expectedHeader the expected stream header - */ - public AbstractSnappyInputStream(InputStream in, int maxBlockSize, int frameHeaderSize, boolean verifyChecksums, byte[] expectedHeader) - throws IOException - { - this.in = in; - this.verifyChecksums = verifyChecksums; - this.recycler = BufferRecycler.instance(); - allocateBuffersBasedOnSize(maxBlockSize + 5); - this.frameHeader = new byte[frameHeaderSize]; - - // stream must begin with stream header - byte[] actualHeader = new byte[expectedHeader.length]; - - int read = readBytes(in, actualHeader, 0, actualHeader.length); - if (read < expectedHeader.length) { - throw new EOFException("encountered EOF while reading stream header"); - } - if (!Arrays.equals(expectedHeader, actualHeader)) { - throw new IOException("invalid stream header"); - } - } - - private void allocateBuffersBasedOnSize(int size) - { - input = recycler.allocInputBuffer(size); - uncompressed = recycler.allocDecodeBuffer(size); - } - - @Override - public int read() - throws IOException - { - if (closed) { - return -1; - } - if (!ensureBuffer()) { - return -1; - } - return buffer[position++] & 0xFF; - } - - @Override - public int read(byte[] output, int offset, int length) - throws IOException - { - checkNotNull(output, "output is null"); - checkPositionIndexes(offset, offset + length, output.length); - if (closed) { - throw new IOException("Stream is closed"); - } - - if (length == 0) { - return 0; - } - if (!ensureBuffer()) { - return -1; - } - - int size = min(length, available()); - System.arraycopy(buffer, position, output, offset, size); - position += size; - return size; - } - - @Override - public int available() - throws IOException - { - if (closed) { - return 0; - } - return valid - position; - } - - @Override - public void close() - throws IOException - { - try { - in.close(); - } - finally { - if (!closed) 
{ - closed = true; - recycler.releaseInputBuffer(input); - recycler.releaseDecodeBuffer(uncompressed); - } - } - } - - enum FrameAction - { - RAW, SKIP, UNCOMPRESS - } - - public static final class FrameMetaData - { - final int length; - final FrameAction frameAction; - - /** - * @param frameAction - * @param length - */ - public FrameMetaData(FrameAction frameAction, int length) - { - this.frameAction = frameAction; - this.length = length; - } - } - - public static final class FrameData - { - final int checkSum; - final int offset; - - public FrameData(int checkSum, int offset) - { - this.checkSum = checkSum; - this.offset = offset; - } - } - - private boolean ensureBuffer() - throws IOException - { - if (available() > 0) { - return true; - } - if (eof) { - return false; - } - - if (!readBlockHeader()) { - eof = true; - return false; - } - - // get action based on header - FrameMetaData frameMetaData = getFrameMetaData(frameHeader); - - if (FrameAction.SKIP == frameMetaData.frameAction) { - SnappyInternalUtils.skip(in, frameMetaData.length); - return ensureBuffer(); - } - - if (frameMetaData.length > input.length) { - allocateBuffersBasedOnSize(frameMetaData.length); - } - - int actualRead = readBytes(in, input, 0, frameMetaData.length); - if (actualRead != frameMetaData.length) { - throw new EOFException("unexpectd EOF when reading frame"); - } - - FrameData frameData = getFrameData(frameHeader, input, actualRead); - - if (FrameAction.UNCOMPRESS == frameMetaData.frameAction) { - int uncompressedLength = Snappy.getUncompressedLength(input, - frameData.offset); - - if (uncompressedLength > uncompressed.length) { - uncompressed = recycler.allocDecodeBuffer(uncompressedLength); - } - - this.valid = Snappy.uncompress(input, frameData.offset, actualRead - - frameData.offset, uncompressed, 0); - this.buffer = uncompressed; - this.position = 0; - } - else { - // we need to start reading at the offset - this.position = frameData.offset; - this.buffer = input; - // valid 
is until the end of the read data, regardless of offset - // indicating where we start - this.valid = actualRead; - } - - if (verifyChecksums) { - int actualCrc32c = Crc32C.maskedCrc32c(buffer, position, valid - position); - if (frameData.checkSum != actualCrc32c) { - throw new IOException("Corrupt input: invalid checksum"); - } - } - - return true; - } - - /** - * Use the content of the frameHeader to describe what type of frame we have - * and the action to take. - */ - protected abstract FrameMetaData getFrameMetaData(byte[] frameHeader) - throws IOException; - - /** - * Take the frame header and the content of the frame to describe metadata - * about the content. - * - * @param frameHeader The frame header. - * @param content The content of the of the frame. Content begins at index {@code 0}. - * @param length The length of the content. - * @return Metadata about the content of the frame. - */ - protected abstract FrameData getFrameData(byte[] frameHeader, byte[] content, int length); - - private boolean readBlockHeader() - throws IOException - { - int read = readBytes(in, frameHeader, 0, frameHeader.length); - - if (read == -1) { - return false; - } - - if (read < frameHeader.length) { - throw new EOFException("encountered EOF while reading block header"); - } - - return true; - } -} diff --git a/src/main/java/org/iq80/snappy/AbstractSnappyOutputStream.java b/src/main/java/org/iq80/snappy/AbstractSnappyOutputStream.java deleted file mode 100644 index abaf3fa..0000000 --- a/src/main/java/org/iq80/snappy/AbstractSnappyOutputStream.java +++ /dev/null @@ -1,239 +0,0 @@ -/* - * Copyright (C) 2011 the original author or authors. - * See the notice.md file distributed with this work for additional - * information regarding copyright ownership. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.iq80.snappy; - -import java.io.IOException; -import java.io.OutputStream; - -import static org.iq80.snappy.Crc32C.maskedCrc32c; -import static org.iq80.snappy.Snappy.maxCompressedLength; -import static org.iq80.snappy.SnappyInternalUtils.checkArgument; -import static org.iq80.snappy.SnappyInternalUtils.checkNotNull; -import static org.iq80.snappy.SnappyInternalUtils.checkPositionIndexes; - -/** - * This is a base class supporting both the {@link SnappyOutputStream} and - * {@link SnappyFramedOutputStream}. - *

- *

- * Delegates writing the header bytes and individual frames to the specific - * implementations. Implementations may also override the crc32 checksum - * calculation. - *

- * - * @since 0.4 - */ -abstract class AbstractSnappyOutputStream - extends OutputStream -{ - private final BufferRecycler recycler; - private final int blockSize; - private final byte[] buffer; - private final byte[] outputBuffer; - private final double minCompressionRatio; - - private final OutputStream out; - - private int position; - private boolean closed; - - /** - * @param out The underlying {@link OutputStream} to write to. Must not be {@code null}. - * @param blockSize The block size (of raw data) to compress before writing frames to out. - * @param minCompressionRatio Defines the minimum compression ratio ({@code compressedLength / rawLength}) that must be achieved to - * write the compressed data. This must be in (0, 1.0]. - */ - public AbstractSnappyOutputStream(OutputStream out, int blockSize, double minCompressionRatio) - throws IOException - { - this.out = checkNotNull(out, "out is null"); - checkArgument(minCompressionRatio > 0 && minCompressionRatio <= 1.0, "minCompressionRatio %1s must be between (0,1.0].", minCompressionRatio); - this.minCompressionRatio = minCompressionRatio; - this.recycler = BufferRecycler.instance(); - this.blockSize = blockSize; - this.buffer = recycler.allocOutputBuffer(blockSize); - this.outputBuffer = recycler.allocEncodingBuffer(maxCompressedLength(blockSize)); - - writeHeader(out); - } - - /** - * Writes the implementation specific header or "marker bytes" to - * out. - * - * @param out The underlying {@link OutputStream}. 
- */ - protected abstract void writeHeader(OutputStream out) - throws IOException; - - @Override - public void write(int b) - throws IOException - { - if (closed) { - throw new IOException("Stream is closed"); - } - if (position >= blockSize) { - flushBuffer(); - } - buffer[position++] = (byte) b; - } - - @Override - public void write(byte[] input, int offset, int length) - throws IOException - { - checkNotNull(input, "input is null"); - checkPositionIndexes(offset, offset + length, input.length); - if (closed) { - throw new IOException("Stream is closed"); - } - - int free = blockSize - position; - - // easy case: enough free space in buffer for entire input - if (free >= length) { - copyToBuffer(input, offset, length); - return; - } - - // fill partial buffer as much as possible and flush - if (position > 0) { - copyToBuffer(input, offset, free); - flushBuffer(); - offset += free; - length -= free; - } - - // write remaining full blocks directly from input array - while (length >= blockSize) { - writeCompressed(input, offset, blockSize); - offset += blockSize; - length -= blockSize; - } - - // copy remaining partial block into now-empty buffer - copyToBuffer(input, offset, length); - } - - @Override - public final void flush() - throws IOException - { - if (closed) { - throw new IOException("Stream is closed"); - } - flushBuffer(); - out.flush(); - } - - @Override - public final void close() - throws IOException - { - if (closed) { - return; - } - try { - flush(); - out.close(); - } - finally { - closed = true; - recycler.releaseOutputBuffer(outputBuffer); - recycler.releaseEncodeBuffer(buffer); - } - } - - private void copyToBuffer(byte[] input, int offset, int length) - { - System.arraycopy(input, offset, buffer, position, length); - position += length; - } - - /** - * Compresses and writes out any buffered data. This does nothing if there - * is no currently buffered data. 
- */ - private void flushBuffer() - throws IOException - { - if (position > 0) { - writeCompressed(buffer, 0, position); - position = 0; - } - } - - /** - * {@link #calculateCRC32C(byte[], int, int) Calculates} the crc, compresses - * the data, determines if the compression ratio is acceptable and calls - * {@link #writeBlock(OutputStream, byte[], int, int, boolean, int)} to - * actually write the frame. - * - * @param input The byte[] containing the raw data to be compressed. - * @param offset The offset into input where the data starts. - * @param length The amount of data in input. - */ - private void writeCompressed(byte[] input, int offset, int length) - throws IOException - { - // crc is based on the user supplied input data - int crc32c = calculateCRC32C(input, offset, length); - - int compressed = Snappy - .compress(input, offset, length, outputBuffer, 0); - - // only use the compressed data if compression ratio is <= the minCompressionRatio - if (((double) compressed / (double) length) <= minCompressionRatio) { - writeBlock(out, outputBuffer, 0, compressed, true, crc32c); - } - else { - // otherwise use the uncompressed data. - writeBlock(out, input, offset, length, false, crc32c); - } - } - - /** - * Calculates a CRC32C checksum over the data. - *

- * This can be overridden to provider alternative implementations (such as - * returning 0 if checksums are not desired). - *

- * - * @return The CRC32 checksum. - */ - protected int calculateCRC32C(byte[] data, int offset, int length) - { - return maskedCrc32c(data, offset, length); - } - - /** - * Write a frame (block) to out. - * - * @param out The {@link OutputStream} to write to. - * @param data The data to write. - * @param offset The offset in data to start at. - * @param length The length of data to use. - * @param compressed Indicates if data is the compressed or raw content. - * This is based on whether the compression ratio desired is - * reached. - * @param crc32c The calculated checksum. - */ - protected abstract void writeBlock(OutputStream out, byte[] data, int offset, int length, boolean compressed, int crc32c) - throws IOException; -} diff --git a/src/main/java/org/iq80/snappy/BufferRecycler.java b/src/main/java/org/iq80/snappy/BufferRecycler.java deleted file mode 100644 index 6dfcc0d..0000000 --- a/src/main/java/org/iq80/snappy/BufferRecycler.java +++ /dev/null @@ -1,183 +0,0 @@ -/* - * Copyright (C) 2011 the original author or authors. - * See the notice.md file distributed with this work for additional - * information regarding copyright ownership. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.iq80.snappy; - -import java.lang.ref.SoftReference; - -/** - * Simple helper class to encapsulate details of basic buffer - * recycling scheme, which helps a lot (as per profiling) for - * smaller encoding cases. 
- * - * @author tatu - */ -class BufferRecycler -{ - private static final int MIN_ENCODING_BUFFER = 4000; - - private static final int MIN_OUTPUT_BUFFER = 8000; - - /** - * This ThreadLocal contains a {@link java.lang.ref.SoftReference} - * to a {@link BufferRecycler} used to provide a low-cost - * buffer recycling for buffers we need for encoding, decoding. - */ - protected static final ThreadLocal> recyclerRef = new ThreadLocal>(); - - private byte[] inputBuffer; - private byte[] outputBuffer; - - private byte[] decodingBuffer; - private byte[] encodingBuffer; - - private short[] encodingHash; - - /** - * Accessor to get thread-local recycler instance - */ - public static BufferRecycler instance() - { - SoftReference ref = recyclerRef.get(); - - BufferRecycler bufferRecycler; - if (ref == null) { - bufferRecycler = null; - } - else { - bufferRecycler = ref.get(); - } - - if (bufferRecycler == null) { - bufferRecycler = new BufferRecycler(); - recyclerRef.set(new SoftReference(bufferRecycler)); - } - return bufferRecycler; - } - - public void clear() - { - inputBuffer = null; - outputBuffer = null; - decodingBuffer = null; - encodingBuffer = null; - encodingHash = null; - } - - /////////////////////////////////////////////////////////////////////// - // Buffers for encoding (output) - /////////////////////////////////////////////////////////////////////// - - public byte[] allocEncodingBuffer(int minSize) - { - byte[] buf = encodingBuffer; - if (buf == null || buf.length < minSize) { - buf = new byte[Math.max(minSize, MIN_ENCODING_BUFFER)]; - } - else { - encodingBuffer = null; - } - return buf; - } - - public void releaseEncodeBuffer(byte[] buffer) - { - if (encodingBuffer == null || buffer.length > encodingBuffer.length) { - encodingBuffer = buffer; - } - } - - public byte[] allocOutputBuffer(int minSize) - { - byte[] buf = outputBuffer; - if (buf == null || buf.length < minSize) { - buf = new byte[Math.max(minSize, MIN_OUTPUT_BUFFER)]; - } - else { - 
outputBuffer = null; - } - return buf; - } - - public void releaseOutputBuffer(byte[] buffer) - { - if (outputBuffer == null || (buffer != null && buffer.length > outputBuffer.length)) { - outputBuffer = buffer; - } - } - - public short[] allocEncodingHash(int suggestedSize) - { - short[] buf = encodingHash; - if (buf == null || buf.length < suggestedSize) { - buf = new short[suggestedSize]; - } - else { - encodingHash = null; - } - return buf; - } - - public void releaseEncodingHash(short[] buffer) - { - if (encodingHash == null || (buffer != null && buffer.length > encodingHash.length)) { - encodingHash = buffer; - } - } - - /////////////////////////////////////////////////////////////////////// - // Buffers for decoding (input) - /////////////////////////////////////////////////////////////////////// - - public byte[] allocInputBuffer(int minSize) - { - byte[] buf = inputBuffer; - if (buf == null || buf.length < minSize) { - buf = new byte[Math.max(minSize, MIN_OUTPUT_BUFFER)]; - } - else { - inputBuffer = null; - } - return buf; - } - - public void releaseInputBuffer(byte[] buffer) - { - if (inputBuffer == null || (buffer != null && buffer.length > inputBuffer.length)) { - inputBuffer = buffer; - } - } - - public byte[] allocDecodeBuffer(int size) - { - byte[] buf = decodingBuffer; - if (buf == null || buf.length < size) { - buf = new byte[size]; - } - else { - decodingBuffer = null; - } - return buf; - } - - public void releaseDecodeBuffer(byte[] buffer) - { - if (decodingBuffer == null || (buffer != null && buffer.length > decodingBuffer.length)) { - decodingBuffer = buffer; - } - } -} diff --git a/src/main/java/org/iq80/snappy/CorruptionException.java b/src/main/java/org/iq80/snappy/CorruptionException.java index 24e797f..d91a637 100644 --- a/src/main/java/org/iq80/snappy/CorruptionException.java +++ b/src/main/java/org/iq80/snappy/CorruptionException.java @@ -1,8 +1,4 @@ /* - * Copyright (C) 2011 the original author or authors. 
- * See the notice.md file distributed with this work for additional - * information regarding copyright ownership. - * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at @@ -20,22 +16,21 @@ public class CorruptionException extends RuntimeException { - public CorruptionException() - { - } + private final long offset; - public CorruptionException(String message) + public CorruptionException(long offset) { - super(message); + this(offset, "Malformed input"); } - public CorruptionException(String message, Throwable cause) + public CorruptionException(long offset, String reason) { - super(message, cause); + super(reason + ": offset=" + offset); + this.offset = offset; } - public CorruptionException(Throwable cause) + public long getOffset() { - super(cause); + return offset; } } diff --git a/src/main/java/org/iq80/snappy/Crc32C.java b/src/main/java/org/iq80/snappy/Crc32C.java index cced9b0..9679946 100644 --- a/src/main/java/org/iq80/snappy/Crc32C.java +++ b/src/main/java/org/iq80/snappy/Crc32C.java @@ -47,24 +47,15 @@ public static int maskedCrc32c(byte[] data, int offset, int length) * Return a masked representation of crc. *

* Motivation: it is problematic to compute the CRC of a string that - * contains embedded CRCs. Therefore we recommend that CRCs stored + * contains embedded CRCs. Therefore, we recommend that CRCs stored * somewhere (e.g., in files) should be masked before being stored. */ - public static int mask(int crc) + private static int mask(int crc) { // Rotate right by 15 bits and add a constant. return ((crc >>> 15) | (crc << 17)) + MASK_DELTA; } - /** - * Return the crc whose masked representation is masked_crc. - */ - public static int unmask(int maskedCrc) - { - int rot = maskedCrc - MASK_DELTA; - return ((rot >>> 17) | (rot << 15)); - } - /** * the current CRC value, bit-flipped */ @@ -73,17 +64,17 @@ public static int unmask(int maskedCrc) /** * Create a new PureJavaCrc32 object. */ - public Crc32C() + private Crc32C() { reset(); } - public int getMaskedValue() + private int getMaskedValue() { return mask(getIntValue()); } - public int getIntValue() + private int getIntValue() { return ~crc; } @@ -107,9 +98,13 @@ public void update(byte[] b, int off, int len) int localCrc = crc; while (len > 7) { int c0 = b[off++] ^ localCrc; - int c1 = b[off++] ^ (localCrc >>>= 8); - int c2 = b[off++] ^ (localCrc >>>= 8); - int c3 = b[off++] ^ (localCrc >>>= 8); + localCrc >>>= 8; + int c1 = b[off++] ^ localCrc; + localCrc >>>= 8; + int c2 = b[off++] ^ localCrc; + localCrc >>>= 8; + int c3 = b[off++] ^ localCrc; + localCrc = (T8_7[c0 & 0xff] ^ T8_6[c1 & 0xff]) ^ (T8_5[c2 & 0xff] ^ T8_4[c3 & 0xff]); @@ -137,7 +132,7 @@ public void update(int b) // java -cp build/test/classes/:build/classes/ \ // org.apache.hadoop.util.TestPureJavaCrc32\$Table 82F63B78 - static final int[] T8_0 = new int[] { + private static final int[] T8_0 = new int[] { 0x00000000, 0xF26B8303, 0xE13B70F7, 0x1350F3F4, 0xC79A971F, 0x35F1141C, 0x26A1E7E8, 0xD4CA64EB, 0x8AD958CF, 0x78B2DBCC, 0x6BE22838, 0x9989AB3B, @@ -203,7 +198,7 @@ public void update(int b) 0x79B737BA, 0x8BDCB4B9, 0x988C474D, 0x6AE7C44E, 0xBE2DA0A5, 
0x4C4623A6, 0x5F16D052, 0xAD7D5351 }; - static final int[] T8_1 = new int[] { + private static final int[] T8_1 = new int[] { 0x00000000, 0x13A29877, 0x274530EE, 0x34E7A899, 0x4E8A61DC, 0x5D28F9AB, 0x69CF5132, 0x7A6DC945, 0x9D14C3B8, 0x8EB65BCF, 0xBA51F356, 0xA9F36B21, @@ -269,7 +264,7 @@ public void update(int b) 0xD98EEDC6, 0xCA2C75B1, 0xFECBDD28, 0xED69455F, 0x97048C1A, 0x84A6146D, 0xB041BCF4, 0xA3E32483 }; - static final int[] T8_2 = new int[] { + private static final int[] T8_2 = new int[] { 0x00000000, 0xA541927E, 0x4F6F520D, 0xEA2EC073, 0x9EDEA41A, 0x3B9F3664, 0xD1B1F617, 0x74F06469, 0x38513EC5, 0x9D10ACBB, 0x773E6CC8, 0xD27FFEB6, @@ -335,7 +330,7 @@ public void update(int b) 0xE5F54FC1, 0x40B4DDBF, 0xAA9A1DCC, 0x0FDB8FB2, 0x7B2BEBDB, 0xDE6A79A5, 0x3444B9D6, 0x91052BA8 }; - static final int[] T8_3 = new int[] { + private static final int[] T8_3 = new int[] { 0x00000000, 0xDD45AAB8, 0xBF672381, 0x62228939, 0x7B2231F3, 0xA6679B4B, 0xC4451272, 0x1900B8CA, 0xF64463E6, 0x2B01C95E, 0x49234067, 0x9466EADF, @@ -401,7 +396,7 @@ public void update(int b) 0x31035088, 0xEC46FA30, 0x8E647309, 0x5321D9B1, 0x4A21617B, 0x9764CBC3, 0xF54642FA, 0x2803E842 }; - static final int[] T8_4 = new int[] { + private static final int[] T8_4 = new int[] { 0x00000000, 0x38116FAC, 0x7022DF58, 0x4833B0F4, 0xE045BEB0, 0xD854D11C, 0x906761E8, 0xA8760E44, 0xC5670B91, 0xFD76643D, 0xB545D4C9, 0x8D54BB65, @@ -467,7 +462,7 @@ public void update(int b) 0x081E60E7, 0x300F0F4B, 0x783CBFBF, 0x402DD013, 0xE85BDE57, 0xD04AB1FB, 0x9879010F, 0xA0686EA3 }; - static final int[] T8_5 = new int[] { + private static final int[] T8_5 = new int[] { 0x00000000, 0xEF306B19, 0xDB8CA0C3, 0x34BCCBDA, 0xB2F53777, 0x5DC55C6E, 0x697997B4, 0x8649FCAD, 0x6006181F, 0x8F367306, 0xBB8AB8DC, 0x54BAD3C5, @@ -533,7 +528,7 @@ public void update(int b) 0x37F2D291, 0xD8C2B988, 0xEC7E7252, 0x034E194B, 0x8507E5E6, 0x6A378EFF, 0x5E8B4525, 0xB1BB2E3C }; - static final int[] T8_6 = new int[] { + private static final int[] T8_6 = new 
int[] { 0x00000000, 0x68032CC8, 0xD0065990, 0xB8057558, 0xA5E0C5D1, 0xCDE3E919, 0x75E69C41, 0x1DE5B089, 0x4E2DFD53, 0x262ED19B, 0x9E2BA4C3, 0xF628880B, @@ -599,7 +594,7 @@ public void update(int b) 0x60F48DC6, 0x08F7A10E, 0xB0F2D456, 0xD8F1F89E, 0xC5144817, 0xAD1764DF, 0x15121187, 0x7D113D4F }; - static final int[] T8_7 = new int[] { + private static final int[] T8_7 = new int[] { 0x00000000, 0x493C7D27, 0x9278FA4E, 0xDB448769, 0x211D826D, 0x6821FF4A, 0xB3657823, 0xFA590504, 0x423B04DA, 0x0B0779FD, 0xD043FE94, 0x997F83B3, diff --git a/src/main/java/org/iq80/snappy/HadoopSnappyCodec.java b/src/main/java/org/iq80/snappy/HadoopSnappyCodec.java index 7ec8b81..aa942e5 100644 --- a/src/main/java/org/iq80/snappy/HadoopSnappyCodec.java +++ b/src/main/java/org/iq80/snappy/HadoopSnappyCodec.java @@ -17,69 +17,95 @@ */ package org.iq80.snappy; +import org.apache.hadoop.conf.Configurable; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.compress.CompressionCodec; import org.apache.hadoop.io.compress.CompressionInputStream; import org.apache.hadoop.io.compress.CompressionOutputStream; import org.apache.hadoop.io.compress.Compressor; import org.apache.hadoop.io.compress.Decompressor; +import org.apache.hadoop.io.compress.DoNotPool; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; +import static org.apache.hadoop.fs.CommonConfigurationKeys.IO_COMPRESSION_CODEC_SNAPPY_BUFFERSIZE_DEFAULT; +import static org.apache.hadoop.fs.CommonConfigurationKeys.IO_COMPRESSION_CODEC_SNAPPY_BUFFERSIZE_KEY; + public class HadoopSnappyCodec - implements CompressionCodec + implements Configurable, CompressionCodec { + private Configuration conf; + + @Override + public Configuration getConf() + { + return conf; + } + + @Override + public void setConf(Configuration conf) + { + this.conf = conf; + } + @Override public CompressionOutputStream createOutputStream(OutputStream outputStream) throws IOException { - return new 
SnappyCompressionOutputStream(outputStream); + return new HadoopSnappyOutputStream(outputStream, getBufferSize()); } @Override public CompressionOutputStream createOutputStream(OutputStream outputStream, Compressor compressor) throws IOException { - throw new UnsupportedOperationException("Snappy Compressor is not supported"); + if (!(compressor instanceof HadoopSnappyCompressor)) { + throw new IllegalArgumentException("Compressor is not the Snappy decompressor"); + } + return new HadoopSnappyOutputStream(outputStream, getBufferSize()); } @Override public Class getCompressorType() { - throw new UnsupportedOperationException("Snappy Compressor is not supported"); + return HadoopSnappyCompressor.class; } @Override public Compressor createCompressor() { - throw new UnsupportedOperationException("Snappy Compressor is not supported"); + return new HadoopSnappyCompressor(); } @Override public CompressionInputStream createInputStream(InputStream inputStream) throws IOException { - return new SnappyCompressionInputStream(inputStream); + return new HadoopSnappyInputStream(inputStream); } @Override - public CompressionInputStream createInputStream(InputStream inputStream, Decompressor decompressor) + public CompressionInputStream createInputStream(InputStream in, Decompressor decompressor) throws IOException { - throw new UnsupportedOperationException("Snappy Decompressor is not supported"); + if (!(decompressor instanceof HadoopSnappyDecompressor)) { + throw new IllegalArgumentException("Decompressor is not the Snappy decompressor"); + } + return new HadoopSnappyInputStream(in); } @Override public Class getDecompressorType() { - throw new UnsupportedOperationException("Snappy Decompressor is not supported"); + return HadoopSnappyDecompressor.class; } @Override public Decompressor createDecompressor() { - throw new UnsupportedOperationException("Snappy Decompressor is not supported"); + return new HadoopSnappyDecompressor(); } @Override @@ -88,72 +114,141 @@ public String 
getDefaultExtension() return ".snappy"; } - private static class SnappyCompressionOutputStream - extends CompressionOutputStream + private int getBufferSize() { - public SnappyCompressionOutputStream(OutputStream outputStream) - throws IOException + // Favor using the configured buffer size. This is not as critical for Snappy + // since Snappy always writes the compressed chunk size, so we always know the + // correct buffer size to create. + int maxUncompressedLength; + if (conf != null) { + maxUncompressedLength = conf.getInt(IO_COMPRESSION_CODEC_SNAPPY_BUFFERSIZE_KEY, IO_COMPRESSION_CODEC_SNAPPY_BUFFERSIZE_DEFAULT); + } + else { + maxUncompressedLength = IO_COMPRESSION_CODEC_SNAPPY_BUFFERSIZE_DEFAULT; + } + return maxUncompressedLength; + } + + /** + * No Hadoop code seems to actually use the compressor, so just return a dummy one so the createOutputStream method + * with a compressor can function. This interface can be implemented if needed. + */ + @DoNotPool + private static class HadoopSnappyCompressor + implements Compressor + { + @Override + public void setInput(byte[] b, int off, int len) { - super(new SnappyOutputStream(outputStream)); + throw new UnsupportedOperationException("Snappy block compressor is not supported"); } @Override - public void write(byte[] b, int off, int len) - throws IOException + public boolean needsInput() { - out.write(b, off, len); + throw new UnsupportedOperationException("Snappy block compressor is not supported"); + } + + @Override + public void setDictionary(byte[] b, int off, int len) + { + throw new UnsupportedOperationException("Snappy block compressor is not supported"); + } + + @Override + public long getBytesRead() + { + throw new UnsupportedOperationException("Snappy block compressor is not supported"); + } + + @Override + public long getBytesWritten() + { + throw new UnsupportedOperationException("Snappy block compressor is not supported"); } @Override public void finish() - throws IOException { - out.flush(); + throw 
new UnsupportedOperationException("Snappy block compressor is not supported"); } @Override - public void resetState() - throws IOException + public boolean finished() { - out.flush(); + throw new UnsupportedOperationException("Snappy block compressor is not supported"); } @Override - public void write(int b) - throws IOException + public int compress(byte[] b, int off, int len) { - out.write(b); + throw new UnsupportedOperationException("Snappy block compressor is not supported"); } + + @Override + public void reset() {} + + @Override + public void end() {} + + @Override + public void reinit(Configuration conf) {} } - private static class SnappyCompressionInputStream - extends CompressionInputStream + /** + * No Hadoop code seems to actually use the decompressor, so just return a dummy one so the createInputStream method + * with a decompressor can function. This interface can be implemented if needed. + */ + @DoNotPool + private static class HadoopSnappyDecompressor + implements Decompressor { - public SnappyCompressionInputStream(InputStream inputStream) - throws IOException + @Override + public void setInput(byte[] b, int off, int len) + { + throw new UnsupportedOperationException("Snappy block decompressor is not supported"); + } + + @Override + public boolean needsInput() { - super(new SnappyInputStream(inputStream)); + throw new UnsupportedOperationException("Snappy block decompressor is not supported"); } @Override - public int read(byte[] b, int off, int len) - throws IOException + public void setDictionary(byte[] b, int off, int len) { - return in.read(b, off, len); + throw new UnsupportedOperationException("Snappy block decompressor is not supported"); } @Override - public void resetState() - throws IOException + public boolean needsDictionary() { - throw new UnsupportedOperationException("resetState not supported for Snappy"); + throw new UnsupportedOperationException("Snappy block decompressor is not supported"); } @Override - public int read() - throws 
IOException + public boolean finished() { - return in.read(); + throw new UnsupportedOperationException("Snappy block decompressor is not supported"); } + + @Override + public int decompress(byte[] b, int off, int len) + { + throw new UnsupportedOperationException("Snappy block decompressor is not supported"); + } + + @Override + public int getRemaining() + { + throw new UnsupportedOperationException("Snappy block decompressor is not supported"); + } + + @Override + public void reset() {} + + @Override + public void end() {} } } diff --git a/src/main/java/org/iq80/snappy/HadoopSnappyInputStream.java b/src/main/java/org/iq80/snappy/HadoopSnappyInputStream.java new file mode 100644 index 0000000..c3e0706 --- /dev/null +++ b/src/main/java/org/iq80/snappy/HadoopSnappyInputStream.java @@ -0,0 +1,163 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.iq80.snappy; + +import org.apache.hadoop.io.compress.CompressionInputStream; + +import java.io.EOFException; +import java.io.IOException; +import java.io.InputStream; + +import static org.iq80.snappy.SnappyConstants.SIZE_OF_LONG; + +class HadoopSnappyInputStream + extends CompressionInputStream +{ + private final InputStream in; + + private int uncompressedBlockLength; + private byte[] uncompressedChunk = new byte[0]; + private int uncompressedChunkOffset; + private int uncompressedChunkLength; + + private byte[] compressed = new byte[0]; + + public HadoopSnappyInputStream(InputStream in) + throws IOException + { + super(in); + this.in = in; + } + + @Override + public int read() + throws IOException + { + if (uncompressedChunkOffset >= uncompressedChunkLength) { + readNextChunk(uncompressedChunk, 0, uncompressedChunk.length); + if (uncompressedChunkLength == 0) { + return -1; + } + } + return uncompressedChunk[uncompressedChunkOffset++] & 0xFF; + } + + @Override + public int read(byte[] output, int offset, int length) + throws IOException + { + if (uncompressedChunkOffset >= uncompressedChunkLength) { + boolean directDecompress = readNextChunk(output, offset, length); + if (uncompressedChunkLength == 0) { + return -1; + } + if (directDecompress) { + uncompressedChunkOffset += uncompressedChunkLength; + return uncompressedChunkLength; + } + } + int size = Math.min(length, uncompressedChunkLength - uncompressedChunkOffset); + System.arraycopy(uncompressedChunk, uncompressedChunkOffset, output, offset, size); + uncompressedChunkOffset += size; + return size; + } + + @Override + public void resetState() + { + uncompressedBlockLength = 0; + uncompressedChunkOffset = 0; + uncompressedChunkLength = 0; + } + + private boolean readNextChunk(byte[] userBuffer, int userOffset, int userLength) + throws IOException + { + uncompressedBlockLength -= uncompressedChunkOffset; + uncompressedChunkOffset = 0; + uncompressedChunkLength = 0; + while 
(uncompressedBlockLength == 0) { + uncompressedBlockLength = readBigEndianInt(); + if (uncompressedBlockLength == -1) { + uncompressedBlockLength = 0; + return false; + } + } + + int compressedChunkLength = readBigEndianInt(); + if (compressedChunkLength == -1) { + return false; + } + + if (compressed.length < compressedChunkLength) { + // over allocate buffer which makes decompression easier + compressed = new byte[compressedChunkLength + SIZE_OF_LONG]; + } + readInput(compressedChunkLength, compressed); + + uncompressedChunkLength = Snappy.getUncompressedLength(compressed, 0); + if (uncompressedChunkLength > uncompressedBlockLength) { + throw new IOException("Chunk uncompressed size is greater than block size"); + } + + boolean directUncompress = true; + if (uncompressedChunkLength > userLength) { + if (uncompressedChunk.length < uncompressedChunkLength) { + // over allocate buffer which makes decompression easier + uncompressedChunk = new byte[uncompressedChunkLength + SIZE_OF_LONG]; + } + directUncompress = false; + userBuffer = uncompressedChunk; + userOffset = 0; + userLength = uncompressedChunk.length; + } + + int bytes = Snappy.uncompress(compressed, 0, compressedChunkLength, userBuffer, userOffset, userLength); + if (uncompressedChunkLength != bytes) { + throw new IOException("Expected to read " + uncompressedChunkLength + " bytes, but data only contained " + bytes + " bytes"); + } + return directUncompress; + } + + private void readInput(int length, byte[] buffer) + throws IOException + { + int offset = 0; + while (offset < length) { + int size = in.read(buffer, offset, length - offset); + if (size == -1) { + throw new EOFException("encountered EOF while reading block data"); + } + offset += size; + } + } + + private int readBigEndianInt() + throws IOException + { + int b1 = in.read(); + if (b1 < 0) { + return -1; + } + int b2 = in.read(); + int b3 = in.read(); + int b4 = in.read(); + + // If any of the other bits are negative, the stream it truncated + 
if ((b2 | b3 | b4) < 0) { + throw new IOException("Stream is truncated"); + } + return ((b1 << 24) + (b2 << 16) + (b3 << 8) + (b4)); + } +} diff --git a/src/main/java/org/iq80/snappy/HadoopSnappyOutputStream.java b/src/main/java/org/iq80/snappy/HadoopSnappyOutputStream.java new file mode 100644 index 0000000..9f3fdd3 --- /dev/null +++ b/src/main/java/org/iq80/snappy/HadoopSnappyOutputStream.java @@ -0,0 +1,115 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.iq80.snappy; + +import org.apache.hadoop.io.compress.CompressionOutputStream; + +import java.io.IOException; +import java.io.OutputStream; + +import static org.iq80.snappy.SnappyConstants.SIZE_OF_LONG; + +class HadoopSnappyOutputStream + extends CompressionOutputStream +{ + private final byte[] inputBuffer; + private final int inputMaxSize; + private int inputOffset; + + private final byte[] outputBuffer; + + public HadoopSnappyOutputStream(OutputStream out, int bufferSize) + { + super(out); + inputBuffer = new byte[bufferSize]; + // leave extra space free at end of buffers to make compression (slightly) faster + inputMaxSize = inputBuffer.length - compressionOverhead(bufferSize); + outputBuffer = new byte[Snappy.maxCompressedLength(inputMaxSize) + SIZE_OF_LONG]; + } + + @Override + public void write(int b) + throws IOException + { + inputBuffer[inputOffset++] = (byte) b; + if (inputOffset >= inputMaxSize) { + writeNextChunk(inputBuffer, 0, this.inputOffset); + } + } + + @Override + 
public void write(byte[] buffer, int offset, int length) + throws IOException + { + while (length > 0) { + int chunkSize = Math.min(length, inputMaxSize - inputOffset); + // favor writing directly from the user buffer to avoid the extra copy + if (inputOffset == 0 && length > inputMaxSize) { + writeNextChunk(buffer, offset, chunkSize); + } + else { + System.arraycopy(buffer, offset, inputBuffer, inputOffset, chunkSize); + inputOffset += chunkSize; + + if (inputOffset >= inputMaxSize) { + writeNextChunk(inputBuffer, 0, inputOffset); + } + } + length -= chunkSize; + offset += chunkSize; + } + } + + @Override + public void finish() + throws IOException + { + if (inputOffset > 0) { + writeNextChunk(inputBuffer, 0, this.inputOffset); + } + } + + @Override + public void resetState() + throws IOException + { + finish(); + } + + private void writeNextChunk(byte[] input, int inputOffset, int inputLength) + throws IOException + { + int compressedSize = Snappy.compress(input, inputOffset, inputLength, outputBuffer, 0); + + writeBigEndianInt(inputLength); + writeBigEndianInt(compressedSize); + out.write(outputBuffer, 0, compressedSize); + + this.inputOffset = 0; + } + + private void writeBigEndianInt(int value) + throws IOException + { + out.write(value >>> 24); + out.write(value >>> 16); + out.write(value >>> 8); + out.write(value); + } + + private static int compressionOverhead(int size) + { + return (size / 6) + 32; + } +} \ No newline at end of file diff --git a/src/main/java/org/iq80/snappy/IncompatibleJvmException.java b/src/main/java/org/iq80/snappy/IncompatibleJvmException.java new file mode 100644 index 0000000..e1dc6c8 --- /dev/null +++ b/src/main/java/org/iq80/snappy/IncompatibleJvmException.java @@ -0,0 +1,23 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.iq80.snappy; + +public class IncompatibleJvmException + extends RuntimeException +{ + public IncompatibleJvmException(String message) + { + super(message); + } +} diff --git a/src/main/java/org/iq80/snappy/Main.java b/src/main/java/org/iq80/snappy/Main.java deleted file mode 100644 index 9de9ed4..0000000 --- a/src/main/java/org/iq80/snappy/Main.java +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright (C) 2011 the original author or authors. - * See the notice.md file distributed with this work for additional - * information regarding copyright ownership. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.iq80.snappy; - -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; - -public class Main -{ - public static void main(String[] args) - throws Exception - { - if ((args.length == 1) && (args[0].equals("-c"))) { - compress(); - } - else if ((args.length == 1) && (args[0].equals("-d"))) { - uncompress(); - } - else { - usage(); - } - } - - private static void usage() - { - System.err.println("Usage: java -jar snappy.jar OPTION"); - System.err.println("Compress or uncompress with Snappy."); - System.err.println(); - System.err.println(" -c compress from stdin to stdout"); - System.err.println(" -d uncompress from stdin to stdout"); - System.exit(100); - } - - private static void compress() - throws IOException - { - copy(System.in, new SnappyOutputStream(System.out)); - } - - private static void uncompress() - throws IOException - { - copy(new SnappyInputStream(System.in), System.out); - } - - private static void copy(InputStream in, OutputStream out) - throws IOException - { - byte[] buf = new byte[4096]; - while (true) { - int r = in.read(buf); - if (r == -1) { - out.close(); - in.close(); - return; - } - out.write(buf, 0, r); - } - } -} diff --git a/src/main/java/org/iq80/snappy/SlowMemory.java b/src/main/java/org/iq80/snappy/SlowMemory.java deleted file mode 100644 index f1f1336..0000000 --- a/src/main/java/org/iq80/snappy/SlowMemory.java +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright (C) 2011 the original author or authors. - * See the notice.md file distributed with this work for additional - * information regarding copyright ownership. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.iq80.snappy; - -class SlowMemory - implements Memory -{ - @Override - public boolean fastAccessSupported() - { - return false; - } - - @Override - public int lookupShort(short[] data, int index) - { - return data[index] & 0xFFFF; - } - - @Override - public int loadByte(byte[] data, int index) - { - return data[index] & 0xFF; - } - - @Override - public int loadInt(byte[] data, int index) - { - return (data[index] & 0xff) | - (data[index + 1] & 0xff) << 8 | - (data[index + 2] & 0xff) << 16 | - (data[index + 3] & 0xff) << 24; - } - - @Override - public void copyLong(byte[] src, int srcIndex, byte[] dest, int destIndex) - { - for (int i = 0; i < 8; i++) { - dest[destIndex + i] = src[srcIndex + i]; - } - } - - @Override - public long loadLong(byte[] data, int index) - { - return (data[index] & 0xffL) | - (data[index + 1] & 0xffL) << 8 | - (data[index + 2] & 0xffL) << 16 | - (data[index + 3] & 0xffL) << 24 | - (data[index + 4] & 0xffL) << 32 | - (data[index + 5] & 0xffL) << 40 | - (data[index + 6] & 0xffL) << 48 | - (data[index + 7] & 0xffL) << 56; - } - - @Override - public void copyMemory(byte[] input, int inputIndex, byte[] output, int outputIndex, int length) - { - System.arraycopy(input, inputIndex, output, outputIndex, length); - } -} diff --git a/src/main/java/org/iq80/snappy/Snappy.java b/src/main/java/org/iq80/snappy/Snappy.java index ee071ef..d201f43 100644 --- a/src/main/java/org/iq80/snappy/Snappy.java +++ b/src/main/java/org/iq80/snappy/Snappy.java @@ -17,83 +17,60 @@ */ package org.iq80.snappy; -import java.io.IOException; 
-import java.io.InputStream; import java.util.Arrays; -import static org.iq80.snappy.SnappyFramed.HEADER_BYTES; -import static org.iq80.snappy.SnappyInternalUtils.checkArgument; -import static org.iq80.snappy.SnappyInternalUtils.checkNotNull; -import static org.iq80.snappy.SnappyOutputStream.STREAM_HEADER; +import static java.lang.String.format; +import static java.util.Objects.requireNonNull; +import static sun.misc.Unsafe.ARRAY_BYTE_BASE_OFFSET; public final class Snappy { - - private static final int MAX_HEADER_LENGTH = Math.max(STREAM_HEADER.length, HEADER_BYTES.length); - - private Snappy() - { - } - - /** - * Uses the stream marker bytes to determine if the {@link SnappyFramedInputStream} or - * {@link SnappyInputStream} should be used to decompress the content of source. - * - * @param source The compressed content to decompress. Must {@link InputStream#markSupported() - * support} {@link InputStream#mark(int).} - * @param verifyChecksums Indicates if the crc32-c checksums should be calculated and verified. - * @return An appropriate {@link InputStream} implementation to decompress the content. - * @throws IllegalArgumentException If source does not {@link InputStream#markSupported() - * support} mark/reset or does not contain the appropriate marker bytes for either implementation. 
- */ - @SuppressWarnings("deprecation") - public static InputStream determineSnappyInputStream(InputStream source, boolean verifyChecksums) - throws IOException - { - checkNotNull(source, "source is null"); - checkArgument(source.markSupported(), "source does not support mark/reset"); - - // read the header and then reset to start of stream - source.mark(MAX_HEADER_LENGTH); - byte[] buffer = new byte[MAX_HEADER_LENGTH]; - int read = SnappyInternalUtils.readBytes(source, buffer, 0, MAX_HEADER_LENGTH); - source.reset(); - - if (read != STREAM_HEADER.length || read != HEADER_BYTES.length) { - throw new IllegalArgumentException("invalid header"); - } - - if (buffer[0] == HEADER_BYTES[0]) { - checkArgument(Arrays.equals(Arrays.copyOf(buffer, HEADER_BYTES.length), HEADER_BYTES), "invalid header"); - return new SnappyFramedInputStream(source, verifyChecksums); - } - else { - checkArgument(Arrays.equals(Arrays.copyOf(buffer, STREAM_HEADER.length), STREAM_HEADER), "invalid header"); - return new SnappyInputStream(source, verifyChecksums); - } - } + private Snappy() {} public static int getUncompressedLength(byte[] compressed, int compressedOffset) throws CorruptionException { - return SnappyDecompressor.getUncompressedLength(compressed, compressedOffset); + long compressedAddress = ARRAY_BYTE_BASE_OFFSET + compressedOffset; + long compressedLimit = ARRAY_BYTE_BASE_OFFSET + compressed.length; + + return SnappyRawDecompressor.getUncompressedLength(compressed, compressedAddress, compressedLimit); } public static byte[] uncompress(byte[] compressed, int compressedOffset, int compressedSize) throws CorruptionException { - return SnappyDecompressor.uncompress(compressed, compressedOffset, compressedSize); + byte[] output = new byte[getUncompressedLength(compressed, compressedOffset)]; + int uncompressedSize = uncompress(compressed, compressedOffset, compressedSize, output, 0); + if (uncompressedSize != output.length) { + throw new CorruptionException(0, format("Recorded length is 
%s bytes but actual length after decompression is %s bytes ", + output.length, + uncompressedSize)); + } + return output; } public static int uncompress(byte[] compressed, int compressedOffset, int compressedSize, byte[] uncompressed, int uncompressedOffset) throws CorruptionException { - return SnappyDecompressor.uncompress(compressed, compressedOffset, compressedSize, uncompressed, uncompressedOffset); + return uncompress(compressed, compressedOffset, compressedSize, uncompressed, uncompressedOffset, uncompressed.length - uncompressedOffset); + } + + public static int uncompress(byte[] compressed, int compressedOffset, int compressedSize, byte[] uncompressed, int uncompressedOffset, int uncompressedLength) + { + verifyRange(compressed, compressedOffset, compressedSize); + verifyRange(uncompressed, uncompressedOffset, uncompressedLength); + + long inputAddress = ARRAY_BYTE_BASE_OFFSET + compressedOffset; + long inputLimit = inputAddress + compressedSize; + long outputAddress = ARRAY_BYTE_BASE_OFFSET + uncompressedOffset; + long outputLimit = outputAddress + uncompressedLength; /* bound the output by the caller-supplied uncompressedLength (range-checked above), not by the end of the array */ + + return SnappyRawDecompressor.decompress(compressed, inputAddress, inputLimit, uncompressed, outputAddress, outputLimit); } public static int maxCompressedLength(int sourceLength) { - return SnappyCompressor.maxCompressedLength(sourceLength); + return SnappyRawCompressor.maxCompressedLength(sourceLength); } public static int compress( @@ -103,14 +80,18 @@ public static int compress( byte[] compressed, int compressedOffset) { - return SnappyCompressor.compress(uncompressed, - uncompressedOffset, - uncompressedLength, - compressed, - compressedOffset); - } + verifyRange(uncompressed, uncompressedOffset, uncompressedLength); + verifyRange(compressed, compressedOffset, compressed.length - compressedOffset); + + long inputAddress = ARRAY_BYTE_BASE_OFFSET + uncompressedOffset; + long inputLimit = inputAddress + uncompressedLength; + long outputAddress =
ARRAY_BYTE_BASE_OFFSET + compressedOffset; + long outputLimit = outputAddress + compressed.length - compressedOffset; + short[] table = new short[SnappyRawCompressor.MAX_HASH_TABLE_SIZE]; + return SnappyRawCompressor.compress(uncompressed, inputAddress, inputLimit, compressed, outputAddress, outputLimit, table); + } public static byte[] compress(byte[] data) { byte[] compressedOut = new byte[maxCompressedLength(data.length)]; @@ -119,8 +100,11 @@ public static byte[] compress(byte[] data) return trimmedBuffer; } - static final int LITERAL = 0; - static final int COPY_1_BYTE_OFFSET = 1; // 3 bit length + 3 bits of offset in opcode - static final int COPY_2_BYTE_OFFSET = 2; - static final int COPY_4_BYTE_OFFSET = 3; + private static void verifyRange(byte[] data, int offset, int length) + { + requireNonNull(data, "data is null"); + if (offset < 0 || length < 0 || length > data.length - offset) { /* subtraction form: "offset + length" can overflow int, wrap negative and slip past this check */ + throw new IllegalArgumentException(format("Invalid offset or length (%s, %s) in array of length %s", offset, length, data.length)); + } + } } diff --git a/src/main/java/org/iq80/snappy/SnappyCompressor.java b/src/main/java/org/iq80/snappy/SnappyCompressor.java deleted file mode 100644 index 54ff780..0000000 --- a/src/main/java/org/iq80/snappy/SnappyCompressor.java +++ /dev/null @@ -1,519 +0,0 @@ -/* - * Copyright (C) 2011 the original author or authors. - * See the notice.md file distributed with this work for additional - * information regarding copyright ownership. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.iq80.snappy; - -import java.nio.ByteOrder; -import java.util.Arrays; - -import static org.iq80.snappy.Snappy.COPY_1_BYTE_OFFSET; -import static org.iq80.snappy.Snappy.COPY_2_BYTE_OFFSET; -import static org.iq80.snappy.Snappy.LITERAL; - -final class SnappyCompressor -{ - private static final boolean NATIVE_LITTLE_ENDIAN = ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN; - - // *** DO NOT CHANGE THE VALUE OF kBlockSize *** - // - // New Compression code chops up the input into blocks of at most - // the following size. This ensures that back-references in the - // output never cross kBlockSize block boundaries. This can be - // helpful in implementing blocked decompression. However the - // decompression code should not rely on this guarantee since older - // compression code may not obey it. - private static final int BLOCK_LOG = 15; - private static final int BLOCK_SIZE = 1 << BLOCK_LOG; - - private static final int INPUT_MARGIN_BYTES = 15; - - private static final int MAX_HASH_TABLE_BITS = 14; - private static final int MAX_HASH_TABLE_SIZE = 1 << MAX_HASH_TABLE_BITS; - - public static int maxCompressedLength(int sourceLength) - { - // Compressed data can be defined as: - // compressed := item* literal* - // item := literal* copy - // - // The trailing literal sequence has a space blowup of at most 62/60 - // since a literal of length 60 needs one tag byte + one extra byte - // for length information. - // - // Item blowup is trickier to measure. Suppose the "copy" op copies - // 4 bytes of data. Because of a special check in the encoding code, - // we produce a 4-byte copy only if the offset is < 65536. Therefore - // the copy op takes 3 bytes to encode, and this type of item leads - // to at most the 62/60 blowup for representing literals. - // - // Suppose the "copy" op copies 5 bytes of data. 
If the offset is big - // enough, it will take 5 bytes to encode the copy op. Therefore the - // worst case here is a one-byte literal followed by a five-byte copy. - // I.e., 6 bytes of input turn into 7 bytes of "compressed" data. - // - // This last factor dominates the blowup, so the final estimate is: - return 32 + sourceLength + sourceLength / 6; - } - - public static int compress( - final byte[] uncompressed, - final int uncompressedOffset, - final int uncompressedLength, - final byte[] compressed, - final int compressedOffset) - { - // First write the uncompressed size to the output as a variable length int - int compressedIndex = writeUncompressedLength(compressed, compressedOffset, uncompressedLength); - - int hashTableSize = getHashTableSize(uncompressedLength); - BufferRecycler recycler = BufferRecycler.instance(); - short[] table = recycler.allocEncodingHash(hashTableSize); - - for (int read = 0; read < uncompressedLength; read += BLOCK_SIZE) { - // Get encoding table for compression - Arrays.fill(table, (short) 0); - - compressedIndex = compressFragment( - uncompressed, - uncompressedOffset + read, - Math.min(uncompressedLength - read, BLOCK_SIZE), - compressed, - compressedIndex, - table); - } - - recycler.releaseEncodingHash(table); - - return compressedIndex - compressedOffset; - } - - private static int compressFragment( - final byte[] input, - final int inputOffset, - final int inputSize, - final byte[] output, - int outputIndex, - final short[] table) - { - int ipIndex = inputOffset; - assert inputSize <= BLOCK_SIZE; - final int ipEndIndex = inputOffset + inputSize; - - int hashTableSize = getHashTableSize(inputSize); - // todo given that hashTableSize is required to be a power of 2, this is overly complex - final int shift = 32 - log2Floor(hashTableSize); - assert (hashTableSize & (hashTableSize - 1)) == 0 : "table must be power of two"; - assert 0xFFFFFFFF >>> shift == hashTableSize - 1; - - // Bytes in [nextEmitIndex, ipIndex) will be emitted 
as literal bytes. Or - // [nextEmitIndex, ipEndIndex) after the main loop. - int nextEmitIndex = ipIndex; - - if (inputSize >= INPUT_MARGIN_BYTES) { - final int ipLimit = inputOffset + inputSize - INPUT_MARGIN_BYTES; - while (ipIndex <= ipLimit) { - assert nextEmitIndex <= ipIndex; - - // The body of this loop calls EmitLiteral once and then EmitCopy one or - // more times. (The exception is that when we're close to exhausting - // the input we exit and emit a literal.) - // - // In the first iteration of this loop we're just starting, so - // there's nothing to copy, so calling EmitLiteral once is - // necessary. And we only start a new iteration when the - // current iteration has determined that a call to EmitLiteral will - // precede the next call to EmitCopy (if any). - // - // Step 1: Scan forward in the input looking for a 4-byte-long match. - // If we get close to exhausting the input exit and emit a final literal. - // - // Heuristic match skipping: If 32 bytes are scanned with no matches - // found, start looking only at every other byte. If 32 more bytes are - // scanned, look at every third byte, etc.. When a match is found, - // immediately go back to looking at every byte. This is a small loss - // (~5% performance, ~0.1% density) for compressible data due to more - // bookkeeping, but for non-compressible data (such as JPEG) it's a huge - // win since the compressor quickly "realizes" the data is incompressible - // and doesn't bother looking for matches everywhere. - // - // The "skip" variable keeps track of how many bytes there are since the - // last match; dividing it by 32 (ie. right-shifting by five) gives the - // number of bytes to move ahead for each iteration. 
- int skip = 32; - - int[] candidateResult = findCandidate(input, ipIndex, ipLimit, inputOffset, shift, table, skip); - ipIndex = candidateResult[0]; - int candidateIndex = candidateResult[1]; - skip = candidateResult[2]; - if (ipIndex + bytesBetweenHashLookups(skip) > ipLimit) { - break; - } - - // Step 2: A 4-byte match has been found. We'll later see if more - // than 4 bytes match. But, prior to the match, input - // bytes [nextEmit, ip) are unmatched. Emit them as "literal bytes." - assert nextEmitIndex + 16 <= ipEndIndex; - outputIndex = emitLiteral(output, outputIndex, input, nextEmitIndex, ipIndex - nextEmitIndex, true); - - // Step 3: Call EmitCopy, and then see if another EmitCopy could - // be our next move. Repeat until we find no match for the - // input immediately after what was consumed by the last EmitCopy call. - // - // If we exit this loop normally then we need to call EmitLiteral next, - // though we don't yet know how big the literal will be. We handle that - // by proceeding to the next iteration of the main loop. We also can exit - // this loop via goto if we get close to exhausting the input. 
- int[] indexes = emitCopies(input, inputOffset, inputSize, ipIndex, output, outputIndex, table, shift, candidateIndex); - ipIndex = indexes[0]; - outputIndex = indexes[1]; - nextEmitIndex = ipIndex; - } - } - - // goto emitRemainder hack - if (nextEmitIndex < ipEndIndex) { - // Emit the remaining bytes as a literal - outputIndex = emitLiteral(output, outputIndex, input, nextEmitIndex, ipEndIndex - nextEmitIndex, false); - } - return outputIndex; - } - - private static int[] findCandidate(byte[] input, int ipIndex, int ipLimit, int inputOffset, int shift, short[] table, int skip) - { - - int candidateIndex = 0; - for (ipIndex += 1; ipIndex + bytesBetweenHashLookups(skip) <= ipLimit; ipIndex += bytesBetweenHashLookups(skip++)) { - // hash the 4 bytes starting at the input pointer - int currentInt = SnappyInternalUtils.loadInt(input, ipIndex); - int hash = hashBytes(currentInt, shift); - - // get the position of a 4 bytes sequence with the same hash - candidateIndex = inputOffset + table[hash]; - assert candidateIndex >= 0; - assert candidateIndex < ipIndex; - - // update the hash to point to the current position - table[hash] = (short) (ipIndex - inputOffset); - - // if the 4 byte sequence a the candidate index matches the sequence at the - // current position, proceed to the next phase - if (currentInt == SnappyInternalUtils.loadInt(input, candidateIndex)) { - break; - } - } - return new int[] {ipIndex, candidateIndex, skip}; - } - - private static int bytesBetweenHashLookups(int skip) - { - return (skip >>> 5); - } - - private static int[] emitCopies( - byte[] input, - final int inputOffset, - final int inputSize, - int ipIndex, - byte[] output, - int outputIndex, - short[] table, - int shift, - int candidateIndex) - { - // Step 3: Call EmitCopy, and then see if another EmitCopy could - // be our next move. Repeat until we find no match for the - // input immediately after what was consumed by the last EmitCopy call. 
- // - // If we exit this loop normally then we need to call EmitLiteral next, - // though we don't yet know how big the literal will be. We handle that - // by proceeding to the next iteration of the main loop. We also can exit - // this loop via goto if we get close to exhausting the input. - int inputBytes; - do { - // We have a 4-byte match at ip, and no need to emit any - // "literal bytes" prior to ip. - int matched = 4 + findMatchLength(input, candidateIndex + 4, input, ipIndex + 4, inputOffset + inputSize); - int offset = ipIndex - candidateIndex; - assert SnappyInternalUtils.equals(input, ipIndex, input, candidateIndex, matched); - ipIndex += matched; - - // emit the copy operation for this chunk - outputIndex = emitCopy(output, outputIndex, offset, matched); - - // are we done? - if (ipIndex >= inputOffset + inputSize - INPUT_MARGIN_BYTES) { - return new int[] {ipIndex, outputIndex}; - } - - // We could immediately start working at ip now, but to improve - // compression we first update table[Hash(ip - 1, ...)]. 
- int prevInt; - if (SnappyInternalUtils.HAS_UNSAFE) { - long foo = SnappyInternalUtils.loadLong(input, ipIndex - 1); - prevInt = (int) foo; - inputBytes = (int) (foo >>> 8); - } - else { - prevInt = SnappyInternalUtils.loadInt(input, ipIndex - 1); - inputBytes = SnappyInternalUtils.loadInt(input, ipIndex); - } - - // add hash starting with previous byte - int prevHash = hashBytes(prevInt, shift); - table[prevHash] = (short) (ipIndex - inputOffset - 1); - - // update hash of current byte - int curHash = hashBytes(inputBytes, shift); - - candidateIndex = inputOffset + table[curHash]; - table[curHash] = (short) (ipIndex - inputOffset); - - } while (inputBytes == SnappyInternalUtils.loadInt(input, candidateIndex)); - return new int[] {ipIndex, outputIndex}; - } - - private static int emitLiteral( - byte[] output, - int outputIndex, - byte[] literal, - final int literalIndex, - final int length, - final boolean allowFastPath) - { - SnappyInternalUtils.checkPositionIndexes(literalIndex, literalIndex + length, literal.length); - - int n = length - 1; // Zero-length literals are disallowed - if (n < 60) { - // Size fits in tag byte - output[outputIndex++] = (byte) (LITERAL | n << 2); - - // The vast majority of copies are below 16 bytes, for which a - // call to memcpy is overkill. This fast path can sometimes - // copy up to 15 bytes too much, but that is okay in the - // main loop, since we have a bit to go on for both sides: - // - // - The input will always have kInputMarginBytes = 15 extra - // available bytes, as long as we're in the main loop, and - // if not, allowFastPath = false. - // - The output will always have 32 spare bytes (see - // MaxCompressedLength). 
- if (allowFastPath && length <= 16) { - SnappyInternalUtils.copyLong(literal, literalIndex, output, outputIndex); - SnappyInternalUtils.copyLong(literal, literalIndex + 8, output, outputIndex + 8); - outputIndex += length; - return outputIndex; - } - } - else if (n < (1 << 8)) { - output[outputIndex++] = (byte) (LITERAL | 59 + 1 << 2); - output[outputIndex++] = (byte) (n); - } - else if (n < (1 << 16)) { - output[outputIndex++] = (byte) (LITERAL | 59 + 2 << 2); - output[outputIndex++] = (byte) (n); - output[outputIndex++] = (byte) (n >>> 8); - } - else if (n < (1 << 24)) { - output[outputIndex++] = (byte) (LITERAL | 59 + 3 << 2); - output[outputIndex++] = (byte) (n); - output[outputIndex++] = (byte) (n >>> 8); - output[outputIndex++] = (byte) (n >>> 16); - } - else { - output[outputIndex++] = (byte) (LITERAL | 59 + 4 << 2); - output[outputIndex++] = (byte) (n); - output[outputIndex++] = (byte) (n >>> 8); - output[outputIndex++] = (byte) (n >>> 16); - output[outputIndex++] = (byte) (n >>> 24); - } - - SnappyInternalUtils.checkPositionIndexes(literalIndex, literalIndex + length, literal.length); - - System.arraycopy(literal, literalIndex, output, outputIndex, length); - outputIndex += length; - return outputIndex; - } - - private static int emitCopyLessThan64( - byte[] output, - int outputIndex, - int offset, - int length) - { - assert offset >= 0; - assert length <= 64; - assert length >= 4; - assert offset < 65536; - - if ((length < 12) && (offset < 2048)) { - int lenMinus4 = length - 4; - assert (lenMinus4 < 8); // Must fit in 3 bits - output[outputIndex++] = (byte) (COPY_1_BYTE_OFFSET | ((lenMinus4) << 2) | ((offset >>> 8) << 5)); - output[outputIndex++] = (byte) (offset); - } - else { - output[outputIndex++] = (byte) (COPY_2_BYTE_OFFSET | ((length - 1) << 2)); - output[outputIndex++] = (byte) (offset); - output[outputIndex++] = (byte) (offset >>> 8); - } - return outputIndex; - } - - private static int emitCopy( - byte[] output, - int outputIndex, - int offset, 
- int length) - { - // Emit 64 byte copies but make sure to keep at least four bytes reserved - while (length >= 68) { - outputIndex = emitCopyLessThan64(output, outputIndex, offset, 64); - length -= 64; - } - - // Emit an extra 60 byte copy if have too much data to fit in one copy - if (length > 64) { - outputIndex = emitCopyLessThan64(output, outputIndex, offset, 60); - length -= 60; - } - - // Emit remainder - outputIndex = emitCopyLessThan64(output, outputIndex, offset, length); - return outputIndex; - } - - private static int findMatchLength( - byte[] s1, - int s1Index, - byte[] s2, - final int s2Index, - int s2Limit) - { - assert (s2Limit >= s2Index); - - if (SnappyInternalUtils.HAS_UNSAFE) { - int matched = 0; - - while (s2Index + matched <= s2Limit - 4 && SnappyInternalUtils.loadInt(s2, s2Index + matched) == SnappyInternalUtils.loadInt(s1, s1Index + matched)) { - matched += 4; - } - - if (NATIVE_LITTLE_ENDIAN && s2Index + matched <= s2Limit - 4) { - int x = SnappyInternalUtils.loadInt(s2, s2Index + matched) ^ SnappyInternalUtils.loadInt(s1, s1Index + matched); - int matchingBits = Integer.numberOfTrailingZeros(x); - matched += matchingBits >> 3; - } - else { - while (s2Index + matched < s2Limit && s1[s1Index + matched] == s2[s2Index + matched]) { - ++matched; - } - } - return matched; - } - else { - int length = s2Limit - s2Index; - for (int matched = 0; matched < length; matched++) { - if (s1[s1Index + matched] != s2[s2Index + matched]) { - return matched; - } - } - return length; - } - } - - private static int getHashTableSize(int inputSize) - { - // Use smaller hash table when input.size() is smaller, since we - // fill the table, incurring O(hash table size) overhead for - // compression, and if the input is short, we won't need that - // many hash table entries anyway. 
- assert (MAX_HASH_TABLE_SIZE >= 256); - - int hashTableSize = 256; - while (hashTableSize < MAX_HASH_TABLE_SIZE && hashTableSize < inputSize) { - hashTableSize <<= 1; - } - assert 0 == (hashTableSize & (hashTableSize - 1)) : "hash must be power of two"; - assert hashTableSize <= MAX_HASH_TABLE_SIZE : "hash table too large"; - return hashTableSize; - -// // todo should be faster but is not -// int newHashTableSize; -// if (inputSize < 256) { -// newHashTableSize = 256; -// } else if (inputSize > kMaxHashTableSize) { -// newHashTableSize = kMaxHashTableSize; -// } else { -// int leadingZeros = Integer.numberOfLeadingZeros(inputSize - 1); -// newHashTableSize = 1 << (32 - leadingZeros); -// } -// -// assert 0 == (newHashTableSize & (newHashTableSize - 1)) : "hash must be power of two"; -// assert newHashTableSize <= kMaxHashTableSize : "hash table too large"; -// return newHashTableSize; - } - - // Any hash function will produce a valid compressed bitstream, but a good - // hash function reduces the number of collisions and thus yields better - // compression for compressible input, and more speed for incompressible - // input. Of course, it doesn't hurt if the hash function is reasonably fast - // either, as it gets called a lot. - private static int hashBytes(int bytes, int shift) - { - int kMul = 0x1e35a7bd; - return (bytes * kMul) >>> shift; - } - - private static int log2Floor(int n) - { - return n == 0 ? -1 : 31 ^ Integer.numberOfLeadingZeros(n); - } - - /** - * Writes the uncompressed length as variable length integer. 
- */ - private static int writeUncompressedLength(byte[] compressed, int compressedOffset, int uncompressedLength) - { - int highBitMask = 0x80; - if (uncompressedLength < (1 << 7) && uncompressedLength >= 0) { - compressed[compressedOffset++] = (byte) (uncompressedLength); - } - else if (uncompressedLength < (1 << 14) && uncompressedLength > 0) { - compressed[compressedOffset++] = (byte) (uncompressedLength | highBitMask); - compressed[compressedOffset++] = (byte) (uncompressedLength >>> 7); - } - else if (uncompressedLength < (1 << 21) && uncompressedLength > 0) { - compressed[compressedOffset++] = (byte) (uncompressedLength | highBitMask); - compressed[compressedOffset++] = (byte) ((uncompressedLength >>> 7) | highBitMask); - compressed[compressedOffset++] = (byte) (uncompressedLength >>> 14); - } - else if (uncompressedLength < (1 << 28) && uncompressedLength > 0) { - compressed[compressedOffset++] = (byte) (uncompressedLength | highBitMask); - compressed[compressedOffset++] = (byte) ((uncompressedLength >>> 7) | highBitMask); - compressed[compressedOffset++] = (byte) ((uncompressedLength >>> 14) | highBitMask); - compressed[compressedOffset++] = (byte) (uncompressedLength >>> 21); - } - else { - compressed[compressedOffset++] = (byte) (uncompressedLength | highBitMask); - compressed[compressedOffset++] = (byte) ((uncompressedLength >>> 7) | highBitMask); - compressed[compressedOffset++] = (byte) ((uncompressedLength >>> 14) | highBitMask); - compressed[compressedOffset++] = (byte) ((uncompressedLength >>> 21) | highBitMask); - compressed[compressedOffset++] = (byte) (uncompressedLength >>> 28); - } - return compressedOffset; - } -} diff --git a/src/main/java/org/iq80/snappy/Memory.java b/src/main/java/org/iq80/snappy/SnappyConstants.java similarity index 51% rename from src/main/java/org/iq80/snappy/Memory.java rename to src/main/java/org/iq80/snappy/SnappyConstants.java index 53972d1..838322c 100644 --- a/src/main/java/org/iq80/snappy/Memory.java +++ 
b/src/main/java/org/iq80/snappy/SnappyConstants.java @@ -1,8 +1,4 @@ /* - * Copyright (C) 2011 the original author or authors. - * See the notice.md file distributed with this work for additional - * information regarding copyright ownership. - * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at @@ -17,19 +13,15 @@ */ package org.iq80.snappy; -interface Memory +final class SnappyConstants { - boolean fastAccessSupported(); - - int lookupShort(short[] data, int index); - - int loadByte(byte[] data, int index); - - int loadInt(byte[] data, int index); - - void copyLong(byte[] src, int srcIndex, byte[] dest, int destIndex); + static final int SIZE_OF_SHORT = 2; + static final int SIZE_OF_INT = 4; + static final int SIZE_OF_LONG = 8; - long loadLong(byte[] data, int index); + static final int LITERAL = 0; + static final int COPY_1_BYTE_OFFSET = 1; // 3 bit length + 3 bits of offset in opcode + static final int COPY_2_BYTE_OFFSET = 2; - void copyMemory(byte[] input, int inputIndex, byte[] output, int outputIndex, int length); + private SnappyConstants() {} } diff --git a/src/main/java/org/iq80/snappy/SnappyDecompressor.java b/src/main/java/org/iq80/snappy/SnappyDecompressor.java deleted file mode 100644 index 0b1b01b..0000000 --- a/src/main/java/org/iq80/snappy/SnappyDecompressor.java +++ /dev/null @@ -1,434 +0,0 @@ -/* - * Copyright (C) 2011 the original author or authors. - * See the notice.md file distributed with this work for additional - * information regarding copyright ownership. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.iq80.snappy; - -import static org.iq80.snappy.SnappyInternalUtils.copyLong; -import static org.iq80.snappy.SnappyInternalUtils.loadByte; -import static org.iq80.snappy.SnappyInternalUtils.lookupShort; - -final class SnappyDecompressor -{ - private static final int MAX_INCREMENT_COPY_OVERFLOW = 20; - - public static int getUncompressedLength(byte[] compressed, int compressedOffset) - throws CorruptionException - { - return readUncompressedLength(compressed, compressedOffset)[0]; - } - - public static byte[] uncompress(byte[] compressed, int compressedOffset, int compressedSize) - throws CorruptionException - { - // Read the uncompressed length from the front of the compressed input - int[] varInt = readUncompressedLength(compressed, compressedOffset); - int expectedLength = varInt[0]; - compressedOffset += varInt[1]; - compressedSize -= varInt[1]; - - // allocate the uncompressed buffer - byte[] uncompressed = new byte[expectedLength]; - - // Process the entire input - int uncompressedSize = decompressAllTags( - compressed, - compressedOffset, - compressedSize, - uncompressed, - 0); - - if (!(expectedLength == uncompressedSize)) { - throw new CorruptionException(String.format("Recorded length is %s bytes but actual length after decompression is %s bytes ", - expectedLength, - uncompressedSize)); - } - - return uncompressed; - } - - public static int uncompress(byte[] compressed, int compressedOffset, int compressedSize, byte[] uncompressed, int uncompressedOffset) - throws CorruptionException - { - // Read the uncompressed length 
from the front of the compressed input - int[] varInt = readUncompressedLength(compressed, compressedOffset); - int expectedLength = varInt[0]; - compressedOffset += varInt[1]; - compressedSize -= varInt[1]; - - SnappyInternalUtils.checkArgument(expectedLength <= uncompressed.length - uncompressedOffset, - "Uncompressed length %s must be less than %s", expectedLength, uncompressed.length - uncompressedOffset); - - // Process the entire input - int uncompressedSize = decompressAllTags( - compressed, - compressedOffset, - compressedSize, - uncompressed, - uncompressedOffset); - - if (!(expectedLength == uncompressedSize)) { - throw new CorruptionException(String.format("Recorded length is %s bytes but actual length after decompression is %s bytes ", - expectedLength, - uncompressedSize)); - } - - return expectedLength; - } - - private static int decompressAllTags( - final byte[] input, - final int inputOffset, - final int inputSize, - final byte[] output, - final int outputOffset) - throws CorruptionException - { - final int outputLimit = output.length; - - final int ipLimit = inputOffset + inputSize; - int opIndex = outputOffset; - int ipIndex = inputOffset; - - while (ipIndex < ipLimit - 5) { - int opCode = loadByte(input, ipIndex++); - int entry = lookupShort(opLookupTable, opCode); - int trailerBytes = entry >>> 11; - int trailer = readTrailer(input, ipIndex, trailerBytes); - - // advance the ipIndex past the op codes - ipIndex += entry >>> 11; - int length = entry & 0xff; - - if ((opCode & 0x3) == Snappy.LITERAL) { - int literalLength = length + trailer; - copyLiteral(input, ipIndex, output, opIndex, literalLength); - ipIndex += literalLength; - opIndex += literalLength; - } - else { - // copyOffset/256 is encoded in bits 8..10. By just fetching - // those bits, we get copyOffset (since the bit-field starts at - // bit 8). 
- int copyOffset = entry & 0x700; - copyOffset += trailer; - - // inline to force hot-spot to keep inline - // - // Equivalent to incrementalCopy (below) except that it can write up to ten extra - // bytes after the end of the copy, and that it is faster. - // - // The main part of this loop is a simple copy of eight bytes at a time until - // we've copied (at least) the requested amount of bytes. However, if op and - // src are less than eight bytes apart (indicating a repeating pattern of - // length < 8), we first need to expand the pattern in order to get the correct - // results. For instance, if the buffer looks like this, with the eight-byte - // and patterns marked as intervals: - // - // abxxxxxxxxxxxx - // [------] src - // [------] op - // - // a single eight-byte copy from to will repeat the pattern once, - // after which we can move two bytes without moving : - // - // ababxxxxxxxxxx - // [------] src - // [------] op - // - // and repeat the exercise until the two no longer overlap. - // - // This allows us to do very well in the special case of one single byte - // repeated many times, without taking a big hit for more general cases. - // - // The worst case of extra writing past the end of the match occurs when - // op - src == 1 and len == 1; the last copy will read from byte positions - // [0..7] and write to [4..11], whereas it was only supposed to write to - // position 1. Thus, ten excess bytes. - { - int spaceLeft = outputLimit - opIndex; - int srcIndex = opIndex - copyOffset; - if (srcIndex < outputOffset) { - throw new CorruptionException("Invalid copy offset for opcode starting at " + (ipIndex - trailerBytes - 1)); - } - - if (length <= 16 && copyOffset >= 8 && spaceLeft >= 16) { - // Fast path, used for the majority (70-80%) of dynamic invocations. 
- copyLong(output, srcIndex, output, opIndex); - copyLong(output, srcIndex + 8, output, opIndex + 8); - } - else if (spaceLeft >= length + MAX_INCREMENT_COPY_OVERFLOW) { - incrementalCopyFastPath(output, srcIndex, opIndex, length); - } - else { - incrementalCopy(output, srcIndex, output, opIndex, length); - } - } - opIndex += length; - } - } - - - for (; ipIndex < ipLimit; ) { - int[] result = decompressTagSlow(input, ipIndex, output, outputLimit, outputOffset, opIndex); - ipIndex = result[0]; - opIndex = result[1]; - } - - return opIndex - outputOffset; - } - - /** - * This is a second copy of the inner loop of decompressTags used when near the end - * of the input. The key difference is the reading of the trailer bytes. The fast - * code does a blind read of the next 4 bytes as an int, and this code assembles - * the int byte-by-byte to assure that the array is not over run. The reason this - * code path is separate is the if condition to choose between these two seemingly - * small differences costs like 10-20% of the throughput. I'm hoping in future - * versions of hot-spot this code can be integrated into the main loop but for now - * it is worth the extra maintenance pain to get the extra 10-20%. 
- */ - private static int[] decompressTagSlow(byte[] input, int ipIndex, byte[] output, int outputLimit, int outputOffset, int opIndex) - throws CorruptionException - { - // read the op code - int opCode = loadByte(input, ipIndex++); - int entry = lookupShort(opLookupTable, opCode); - int trailerBytes = entry >>> 11; - // - // Key difference here - // - int trailer = 0; - switch (trailerBytes) { - case 4: - trailer = (input[ipIndex + 3] & 0xff) << 24; - case 3: - trailer |= (input[ipIndex + 2] & 0xff) << 16; - case 2: - trailer |= (input[ipIndex + 1] & 0xff) << 8; - case 1: - trailer |= (input[ipIndex] & 0xff); - } - - // advance the ipIndex past the op codes - ipIndex += trailerBytes; - int length = entry & 0xff; - - if ((opCode & 0x3) == Snappy.LITERAL) { - int literalLength = length + trailer; - copyLiteral(input, ipIndex, output, opIndex, literalLength); - ipIndex += literalLength; - opIndex += literalLength; - } - else { - // copyOffset/256 is encoded in bits 8..10. By just fetching - // those bits, we get copyOffset (since the bit-field starts at - // bit 8). - int copyOffset = entry & 0x700; - copyOffset += trailer; - - // inline to force hot-spot to keep inline - { - int spaceLeft = outputLimit - opIndex; - int srcIndex = opIndex - copyOffset; - - if (srcIndex < outputOffset) { - throw new CorruptionException("Invalid copy offset for opcode starting at " + (ipIndex - trailerBytes - 1)); - } - - if (length <= 16 && copyOffset >= 8 && spaceLeft >= 16) { - // Fast path, used for the majority (70-80%) of dynamic invocations. 
- copyLong(output, srcIndex, output, opIndex); - copyLong(output, srcIndex + 8, output, opIndex + 8); - } - else if (spaceLeft >= length + MAX_INCREMENT_COPY_OVERFLOW) { - incrementalCopyFastPath(output, srcIndex, opIndex, length); - } - else { - incrementalCopy(output, srcIndex, output, opIndex, length); - } - } - opIndex += length; - } - return new int[] {ipIndex, opIndex}; - } - - private static int readTrailer(byte[] data, int index, int bytes) - { - return SnappyInternalUtils.loadInt(data, index) & wordmask[bytes]; - } - - private static void copyLiteral(byte[] input, int ipIndex, byte[] output, int opIndex, int length) - throws CorruptionException - { - assert length > 0; - assert ipIndex >= 0; - assert opIndex >= 0; - - int spaceLeft = output.length - opIndex; - int readableBytes = input.length - ipIndex; - - if (readableBytes < length || spaceLeft < length) { - throw new CorruptionException("Corrupt literal length"); - } - - if (length <= 16 && spaceLeft >= 16 && readableBytes >= 16) { - copyLong(input, ipIndex, output, opIndex); - copyLong(input, ipIndex + 8, output, opIndex + 8); - } - else { - int fastLength = length & 0xFFFFFFF8; - if (fastLength <= 64) { - // copy long-by-long - for (int i = 0; i < fastLength; i += 8) { - copyLong(input, ipIndex + i, output, opIndex + i); - } - - // copy byte-by-byte - int slowLength = length & 0x7; - // NOTE: This is not a manual array copy. We are copying an overlapping region - // and we want input data to repeat as it is recopied. see incrementalCopy below. - //noinspection ManualArrayCopy - for (int i = 0; i < slowLength; i += 1) { - output[opIndex + fastLength + i] = input[ipIndex + fastLength + i]; - } - } - else { - SnappyInternalUtils.copyMemory(input, ipIndex, output, opIndex, length); - } - } - } - - /** - * Copy "len" bytes from "src" to "op", one byte at a time. Used for - * handling COPY operations where the input and output regions may - * overlap. 
For example, suppose: - * src == "ab" - * op == src + 2 - * len == 20 - *

- * After incrementalCopy, the result will have - * eleven copies of "ab" - * ababababababababababab - * Note that this does not match the semantics of either memcpy() - * or memmove(). - */ - private static void incrementalCopy(byte[] src, int srcIndex, byte[] op, int opIndex, int length) - { - do { - op[opIndex++] = src[srcIndex++]; - } while (--length > 0); - } - - private static void incrementalCopyFastPath(byte[] output, int srcIndex, int opIndex, int length) - { - int copiedLength = 0; - while ((opIndex + copiedLength) - srcIndex < 8) { - copyLong(output, srcIndex, output, opIndex + copiedLength); - copiedLength += (opIndex + copiedLength) - srcIndex; - } - - for (int i = 0; i < length - copiedLength; i += 8) { - copyLong(output, srcIndex + i, output, opIndex + copiedLength + i); - } - } - - // Mapping from i in range [0,4] to a mask to extract the bottom 8*i bits - private static final int[] wordmask = new int[] { - 0, 0xff, 0xffff, 0xffffff, 0xffffffff - }; - - // Data stored per entry in lookup table: - // Range Bits-used Description - // ------------------------------------ - // 1..64 0..7 Literal/copy length encoded in opcode byte - // 0..7 8..10 Copy offset encoded in opcode byte / 256 - // 0..4 11..13 Extra bytes after opcode - // - // We use eight bits for the length even though 7 would have sufficed - // because of efficiency reasons: - // (1) Extracting a byte is faster than a bit-field - // (2) It properly aligns copy offset so we do not need a <<8 - private static final short[] opLookupTable = new short[] { - 0x0001, 0x0804, 0x1001, 0x2001, 0x0002, 0x0805, 0x1002, 0x2002, - 0x0003, 0x0806, 0x1003, 0x2003, 0x0004, 0x0807, 0x1004, 0x2004, - 0x0005, 0x0808, 0x1005, 0x2005, 0x0006, 0x0809, 0x1006, 0x2006, - 0x0007, 0x080a, 0x1007, 0x2007, 0x0008, 0x080b, 0x1008, 0x2008, - 0x0009, 0x0904, 0x1009, 0x2009, 0x000a, 0x0905, 0x100a, 0x200a, - 0x000b, 0x0906, 0x100b, 0x200b, 0x000c, 0x0907, 0x100c, 0x200c, - 0x000d, 0x0908, 0x100d, 0x200d, 0x000e, 0x0909, 
0x100e, 0x200e, - 0x000f, 0x090a, 0x100f, 0x200f, 0x0010, 0x090b, 0x1010, 0x2010, - 0x0011, 0x0a04, 0x1011, 0x2011, 0x0012, 0x0a05, 0x1012, 0x2012, - 0x0013, 0x0a06, 0x1013, 0x2013, 0x0014, 0x0a07, 0x1014, 0x2014, - 0x0015, 0x0a08, 0x1015, 0x2015, 0x0016, 0x0a09, 0x1016, 0x2016, - 0x0017, 0x0a0a, 0x1017, 0x2017, 0x0018, 0x0a0b, 0x1018, 0x2018, - 0x0019, 0x0b04, 0x1019, 0x2019, 0x001a, 0x0b05, 0x101a, 0x201a, - 0x001b, 0x0b06, 0x101b, 0x201b, 0x001c, 0x0b07, 0x101c, 0x201c, - 0x001d, 0x0b08, 0x101d, 0x201d, 0x001e, 0x0b09, 0x101e, 0x201e, - 0x001f, 0x0b0a, 0x101f, 0x201f, 0x0020, 0x0b0b, 0x1020, 0x2020, - 0x0021, 0x0c04, 0x1021, 0x2021, 0x0022, 0x0c05, 0x1022, 0x2022, - 0x0023, 0x0c06, 0x1023, 0x2023, 0x0024, 0x0c07, 0x1024, 0x2024, - 0x0025, 0x0c08, 0x1025, 0x2025, 0x0026, 0x0c09, 0x1026, 0x2026, - 0x0027, 0x0c0a, 0x1027, 0x2027, 0x0028, 0x0c0b, 0x1028, 0x2028, - 0x0029, 0x0d04, 0x1029, 0x2029, 0x002a, 0x0d05, 0x102a, 0x202a, - 0x002b, 0x0d06, 0x102b, 0x202b, 0x002c, 0x0d07, 0x102c, 0x202c, - 0x002d, 0x0d08, 0x102d, 0x202d, 0x002e, 0x0d09, 0x102e, 0x202e, - 0x002f, 0x0d0a, 0x102f, 0x202f, 0x0030, 0x0d0b, 0x1030, 0x2030, - 0x0031, 0x0e04, 0x1031, 0x2031, 0x0032, 0x0e05, 0x1032, 0x2032, - 0x0033, 0x0e06, 0x1033, 0x2033, 0x0034, 0x0e07, 0x1034, 0x2034, - 0x0035, 0x0e08, 0x1035, 0x2035, 0x0036, 0x0e09, 0x1036, 0x2036, - 0x0037, 0x0e0a, 0x1037, 0x2037, 0x0038, 0x0e0b, 0x1038, 0x2038, - 0x0039, 0x0f04, 0x1039, 0x2039, 0x003a, 0x0f05, 0x103a, 0x203a, - 0x003b, 0x0f06, 0x103b, 0x203b, 0x003c, 0x0f07, 0x103c, 0x203c, - 0x0801, 0x0f08, 0x103d, 0x203d, 0x1001, 0x0f09, 0x103e, 0x203e, - 0x1801, 0x0f0a, 0x103f, 0x203f, 0x2001, 0x0f0b, 0x1040, 0x2040 - }; - - /** - * Reads the variable length integer encoded a the specified offset, and - * returns this length with the number of bytes read. 
- */ - private static int[] readUncompressedLength(byte[] compressed, int compressedOffset) - throws CorruptionException - { - int result; - int bytesRead = 0; - { - int b = compressed[compressedOffset + bytesRead++] & 0xFF; - result = b & 0x7f; - if ((b & 0x80) != 0) { - b = compressed[compressedOffset + bytesRead++] & 0xFF; - result |= (b & 0x7f) << 7; - if ((b & 0x80) != 0) { - b = compressed[compressedOffset + bytesRead++] & 0xFF; - result |= (b & 0x7f) << 14; - if ((b & 0x80) != 0) { - b = compressed[compressedOffset + bytesRead++] & 0xFF; - result |= (b & 0x7f) << 21; - if ((b & 0x80) != 0) { - b = compressed[compressedOffset + bytesRead++] & 0xFF; - result |= (b & 0x7f) << 28; - if ((b & 0x80) != 0) { - throw new CorruptionException("last byte of compressed length int has high bit set"); - } - } - } - } - } - } - return new int[] {result, bytesRead}; - } -} diff --git a/src/main/java/org/iq80/snappy/SnappyFramedInputStream.java b/src/main/java/org/iq80/snappy/SnappyFramedInputStream.java index a3de8c5..467b21c 100644 --- a/src/main/java/org/iq80/snappy/SnappyFramedInputStream.java +++ b/src/main/java/org/iq80/snappy/SnappyFramedInputStream.java @@ -17,29 +17,221 @@ */ package org.iq80.snappy; + +import java.io.EOFException; import java.io.IOException; import java.io.InputStream; +import java.util.Arrays; -import static org.iq80.snappy.SnappyFramed.COMPRESSED_DATA_FLAG; -import static org.iq80.snappy.SnappyFramed.HEADER_BYTES; -import static org.iq80.snappy.SnappyFramed.STREAM_IDENTIFIER_FLAG; -import static org.iq80.snappy.SnappyFramed.UNCOMPRESSED_DATA_FLAG; import static org.iq80.snappy.SnappyFramedOutputStream.MAX_BLOCK_SIZE; +import static java.lang.Math.min; /** * Implements the x-snappy-framed as an {@link InputStream}. 
*/ -public class SnappyFramedInputStream - extends AbstractSnappyInputStream +public final class SnappyFramedInputStream + extends InputStream { + private final InputStream in; + private final byte[] frameHeader; + private final boolean verifyChecksums; + + /** + * A single frame read from the underlying {@link InputStream}. + */ + private byte[] input = new byte[0]; + /** + * The decompressed data from {@link #input}. + */ + private byte[] uncompressed = new byte[0]; + /** + * Indicates if this instance has been closed. + */ + private boolean closed; + /** + * Indicates if we have reached the EOF on {@link #in}. + */ + private boolean eof; + /** + * The position in {@link #input} to read to. + */ + private int valid; + /** + * The next position to read from {@link #buffer}. + */ + private int position; + /** + * Buffer is a reference to the real buffer of uncompressed data for the + * current block: uncompressed if the block is compressed, or input if it is + * not. + */ + private byte[] buffer; + + public SnappyFramedInputStream(InputStream in) + throws IOException + { + this(in, true); + } + public SnappyFramedInputStream(InputStream in, boolean verifyChecksums) throws IOException { - super(in, MAX_BLOCK_SIZE, 4, verifyChecksums, HEADER_BYTES); + this.in = in; + this.verifyChecksums = verifyChecksums; + allocateBuffersBasedOnSize(MAX_BLOCK_SIZE + 5); + this.frameHeader = new byte[4]; + + // stream must begin with stream header + byte[] actualHeader = new byte[SnappyFramed.HEADER_BYTES.length]; + + int read = SnappyInternalUtils.readBytes(in, actualHeader, 0, actualHeader.length); + if (read < SnappyFramed.HEADER_BYTES.length) { + throw new EOFException("encountered EOF while reading stream header"); + } + if (!Arrays.equals(SnappyFramed.HEADER_BYTES, actualHeader)) { + throw new IOException("invalid stream header"); + } + } + + @Override + public int read() + throws IOException + { + if (closed) { + return -1; + } + if (!ensureBuffer()) { + return -1; + } + 
return buffer[position++] & 0xFF; + } + + @Override + public int read(byte[] output, int offset, int length) + throws IOException + { + SnappyInternalUtils.checkNotNull(output, "output is null"); + SnappyInternalUtils.checkPositionIndexes(offset, offset + length, output.length); + if (closed) { + throw new IOException("Stream is closed"); + } + + if (length == 0) { + return 0; + } + if (!ensureBuffer()) { + return -1; + } + + int size = min(length, available()); + System.arraycopy(buffer, position, output, offset, size); + position += size; + return size; + } + + @Override + public int available() + throws IOException + { + if (closed) { + return 0; + } + return valid - position; } @Override - protected FrameMetaData getFrameMetaData(byte[] frameHeader) + public void close() + throws IOException + { + try { + in.close(); + } + finally { + if (!closed) { + closed = true; + } + } + } + + private boolean ensureBuffer() + throws IOException + { + if (available() > 0) { + return true; + } + if (eof) { + return false; + } + + if (!readBlockHeader()) { + eof = true; + return false; + } + + // get action based on header + FrameMetaData frameMetaData = getFrameMetaData(frameHeader); + + if (FrameAction.SKIP == frameMetaData.frameAction) { + SnappyInternalUtils.skip(in, frameMetaData.length); + return ensureBuffer(); + } + + if (frameMetaData.length > input.length) { + allocateBuffersBasedOnSize(frameMetaData.length); + } + + int actualRead = SnappyInternalUtils.readBytes(in, input, 0, frameMetaData.length); + if (actualRead != frameMetaData.length) { + throw new EOFException("unexpected EOF when reading frame"); + } + + FrameData frameData = getFrameData(input); + + if (FrameAction.UNCOMPRESS == frameMetaData.frameAction) { + int uncompressedLength = Snappy.getUncompressedLength(input, frameData.offset); + + if (uncompressedLength > uncompressed.length) { + uncompressed = new byte[uncompressedLength]; + } + + this.valid = Snappy.uncompress(input, frameData.offset, 
actualRead - frameData.offset, uncompressed, 0); + this.buffer = uncompressed; + this.position = 0; + } + else { + // we need to start reading at the offset + this.position = frameData.offset; + this.buffer = input; + // valid is until the end of the read data, regardless of offset + // indicating where we start + this.valid = actualRead; + } + + if (verifyChecksums) { + int actualCrc32c = Crc32C.maskedCrc32c(buffer, position, valid - position); + if (frameData.checkSum != actualCrc32c) { + throw new IOException("Corrupt input: invalid checksum"); + } + } + + return true; + } + + private void allocateBuffersBasedOnSize(int size) + { + if (input.length < size) { + input = new byte[size]; + } + if (uncompressed.length < size) { + uncompressed = new byte[size]; + } + } + + /** + * Use the content of the frameHeader to describe what type of frame we have + * and the action to take. + */ + private static FrameMetaData getFrameMetaData(byte[] frameHeader) throws IOException { int length = (frameHeader[1] & 0xFF); @@ -50,15 +242,15 @@ protected FrameMetaData getFrameMetaData(byte[] frameHeader) FrameAction frameAction; int flag = frameHeader[0] & 0xFF; switch (flag) { - case COMPRESSED_DATA_FLAG: + case SnappyFramed.COMPRESSED_DATA_FLAG: frameAction = FrameAction.UNCOMPRESS; minLength = 5; break; - case UNCOMPRESSED_DATA_FLAG: + case SnappyFramed.UNCOMPRESSED_DATA_FLAG: frameAction = FrameAction.RAW; minLength = 5; break; - case STREAM_IDENTIFIER_FLAG: + case SnappyFramed.STREAM_IDENTIFIER_FLAG: if (length != 6) { throw new IOException("stream identifier chunk with invalid length: " + length); } @@ -83,8 +275,13 @@ protected FrameMetaData getFrameMetaData(byte[] frameHeader) return new FrameMetaData(frameAction, length); } - @Override - protected FrameData getFrameData(byte[] frameHeader, byte[] content, int length) + /** + * Extract frame data + * + * @param content The content of the frame. Content begins at index {@code 0}. 
+ * @return Metadata about the content of the frame. + */ + private static FrameData getFrameData(byte[] content) { // crc is contained in the frame content int crc32c = (content[3] & 0xFF) << 24 | @@ -94,4 +291,53 @@ protected FrameData getFrameData(byte[] frameHeader, byte[] content, int length) return new FrameData(crc32c, 4); } + + private boolean readBlockHeader() + throws IOException + { + int read = SnappyInternalUtils.readBytes(in, frameHeader, 0, frameHeader.length); + + if (read == -1) { + return false; + } + + if (read < frameHeader.length) { + throw new EOFException("encountered EOF while reading block header"); + } + + return true; + } + + private enum FrameAction + { + RAW, SKIP, UNCOMPRESS + } + + private static final class FrameMetaData + { + final int length; + final FrameAction frameAction; + + /** + * @param frameAction + * @param length + */ + public FrameMetaData(FrameAction frameAction, int length) + { + this.frameAction = frameAction; + this.length = length; + } + } + + private static final class FrameData + { + final int checkSum; + final int offset; + + public FrameData(int checkSum, int offset) + { + this.checkSum = checkSum; + this.offset = offset; + } + } } diff --git a/src/main/java/org/iq80/snappy/SnappyFramedOutputStream.java b/src/main/java/org/iq80/snappy/SnappyFramedOutputStream.java index e625ea7..fa18f40 100644 --- a/src/main/java/org/iq80/snappy/SnappyFramedOutputStream.java +++ b/src/main/java/org/iq80/snappy/SnappyFramedOutputStream.java @@ -20,16 +20,11 @@ import java.io.IOException; import java.io.OutputStream; -import static org.iq80.snappy.SnappyFramed.COMPRESSED_DATA_FLAG; -import static org.iq80.snappy.SnappyFramed.HEADER_BYTES; -import static org.iq80.snappy.SnappyFramed.UNCOMPRESSED_DATA_FLAG; -import static org.iq80.snappy.SnappyInternalUtils.checkArgument; - /** * Implements the x-snappy-framed as an {@link OutputStream}. 
*/ public final class SnappyFramedOutputStream - extends AbstractSnappyOutputStream + extends OutputStream { /** * We place an additional restriction that the uncompressed data in @@ -41,42 +36,217 @@ public final class SnappyFramedOutputStream public static final int DEFAULT_BLOCK_SIZE = MAX_BLOCK_SIZE; public static final double DEFAULT_MIN_COMPRESSION_RATIO = 0.85d; + private final int blockSize; + private final byte[] buffer; + private final byte[] outputBuffer; + private final double minCompressionRatio; + private final OutputStream out; + private final boolean writeChecksums; + + private int position; + private boolean closed; + /** + * Creates a Snappy output stream to write data to the specified underlying output stream. + * + * @param out the underlying output stream + */ public SnappyFramedOutputStream(OutputStream out) throws IOException { - this(out, DEFAULT_BLOCK_SIZE, DEFAULT_MIN_COMPRESSION_RATIO); + this(out, true); + } + + /** + * Creates a Snappy output stream to write data to the specified underlying output stream. + * + * @param out the underlying output stream + */ + public SnappyFramedOutputStream(OutputStream out, int blockSize, double minCompressionRatio) + throws IOException + { + this(out, true, blockSize, minCompressionRatio); + } + + /** + * Creates a Snappy output stream with block checksums disabled. This is only useful for + * apples-to-apples benchmarks with other compressors that do not perform block checksums. 
+ * + * @param out the underlying output stream + */ + public static SnappyFramedOutputStream newChecksumFreeBenchmarkOutputStream(OutputStream out) + throws IOException + { + return new SnappyFramedOutputStream(out, false); + } + + private SnappyFramedOutputStream(OutputStream out, boolean writeChecksums) + throws IOException + { + this(out, writeChecksums, DEFAULT_BLOCK_SIZE, DEFAULT_MIN_COMPRESSION_RATIO); } - public SnappyFramedOutputStream(OutputStream out, int blockSize, - double minCompressionRatio) + private SnappyFramedOutputStream(OutputStream out, boolean writeChecksums, int blockSize, double minCompressionRatio) throws IOException { - super(out, blockSize, minCompressionRatio); - checkArgument(blockSize > 0 && blockSize <= MAX_BLOCK_SIZE, "blockSize must be in (0, 65536]", blockSize); + this.out = SnappyInternalUtils.checkNotNull(out, "out is null"); + this.writeChecksums = writeChecksums; + SnappyInternalUtils.checkArgument(minCompressionRatio > 0 && minCompressionRatio <= 1.0, "minCompressionRatio %1s must be between (0,1.0].", minCompressionRatio); + this.minCompressionRatio = minCompressionRatio; + this.blockSize = blockSize; + this.buffer = new byte[blockSize]; + this.outputBuffer = new byte[Snappy.maxCompressedLength(blockSize)]; + + out.write(SnappyFramed.HEADER_BYTES); + SnappyInternalUtils.checkArgument(blockSize > 0 && blockSize <= MAX_BLOCK_SIZE, "blockSize must be in (0, 65536]", blockSize); } @Override - protected void writeHeader(OutputStream out) + public void write(int b) throws IOException { - out.write(HEADER_BYTES); + if (closed) { + throw new IOException("Stream is closed"); + } + if (position >= blockSize) { + flushBuffer(); + } + buffer[position++] = (byte) b; + } + + @Override + public void write(byte[] input, int offset, int length) + throws IOException + { + SnappyInternalUtils.checkNotNull(input, "input is null"); + SnappyInternalUtils.checkPositionIndexes(offset, offset + length, input.length); + if (closed) { + throw new 
IOException("Stream is closed"); + } + + int free = blockSize - position; + + // easy case: enough free space in buffer for entire input + if (free >= length) { + copyToBuffer(input, offset, length); + return; + } + + // fill partial buffer as much as possible and flush + if (position > 0) { + copyToBuffer(input, offset, free); + flushBuffer(); + offset += free; + length -= free; + } + + // write remaining full blocks directly from input array + while (length >= blockSize) { + writeCompressed(input, offset, blockSize); + offset += blockSize; + length -= blockSize; + } + + // copy remaining partial block into now-empty buffer + copyToBuffer(input, offset, length); + } + + @Override + public void flush() + throws IOException + { + if (closed) { + throw new IOException("Stream is closed"); + } + flushBuffer(); + out.flush(); + } + + @Override + public void close() + throws IOException + { + if (closed) { + return; + } + try { + flush(); + out.close(); + } + finally { + closed = true; + } + } + + private void copyToBuffer(byte[] input, int offset, int length) + { + System.arraycopy(input, offset, buffer, position, length); + position += length; } /** - * Each chunk consists first a single byte of chunk identifier, then a - * three-byte little-endian length of the chunk in bytes (from 0 to - * 16777215, inclusive), and then the data if any. The four bytes of chunk - * header is not counted in the data length. + * Compresses and writes out any buffered data. This does nothing if there + * is no currently buffered data. 
*/ - @Override - protected void writeBlock(OutputStream out, byte[] data, int offset, int length, boolean compressed, int crc32c) + private void flushBuffer() + throws IOException + { + if (position > 0) { + writeCompressed(buffer, 0, position); + position = 0; + } + } + + /** + * {@link Crc32C#maskedCrc32c(byte[], int, int) Calculates} the crc, compresses + * the data, determines if the compression ratio is acceptable and calls + * {@link #writeBlock(OutputStream, byte[], int, int, boolean, int)} to + * actually write the frame. + * + * @param input The byte[] containing the raw data to be compressed. + * @param offset The offset into input where the data starts. + * @param length The amount of data in input. + */ + private void writeCompressed(byte[] input, int offset, int length) + throws IOException + { + // crc is based on the user supplied input data + int crc32c = writeChecksums ? Crc32C.maskedCrc32c(input, offset, length) : 0; + + int compressed = Snappy.compress(input, + offset, + length, + outputBuffer, + 0); + + // only use the compressed data if compression ratio is <= the minCompressionRatio + if (((double) compressed / (double) length) <= minCompressionRatio) { + writeBlock(out, outputBuffer, 0, compressed, true, crc32c); + } + else { + // otherwise use the uncompressed data. + writeBlock(out, input, offset, length, false, crc32c); + } + } + + /** + * Write a frame (block) to out. + * + * @param out The {@link OutputStream} to write to. + * @param data The data to write. + * @param offset The offset in data to start at. + * @param length The length of data to use. + * @param compressed Indicates if data is the compressed or raw content. + * This is based on whether the compression ratio desired is + * reached. + * @param crc32c The calculated checksum. + */ + private static void writeBlock(OutputStream out, byte[] data, int offset, int length, boolean compressed, int crc32c) throws IOException { - out.write(compressed ? 
COMPRESSED_DATA_FLAG : UNCOMPRESSED_DATA_FLAG); + out.write(compressed ? SnappyFramed.COMPRESSED_DATA_FLAG : SnappyFramed.UNCOMPRESSED_DATA_FLAG); - // the length written out to the header is both the checksum and the - // frame + // the length written out to the header is both the checksum and the frame int headerLength = length + 4; // write length diff --git a/src/main/java/org/iq80/snappy/SnappyInputStream.java b/src/main/java/org/iq80/snappy/SnappyInputStream.java deleted file mode 100644 index 19afb92..0000000 --- a/src/main/java/org/iq80/snappy/SnappyInputStream.java +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Copyright (C) 2011 the original author or authors. - * See the notice.md file distributed with this work for additional - * information regarding copyright ownership. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.iq80.snappy; - -import java.io.IOException; -import java.io.InputStream; -import java.util.Arrays; - -import static java.lang.String.format; -import static org.iq80.snappy.SnappyOutputStream.MAX_BLOCK_SIZE; -import static org.iq80.snappy.SnappyOutputStream.STREAM_HEADER; - -/** - * This class implements an input stream for reading Snappy compressed data - * of the format produced by {@link SnappyOutputStream}. - *

- * NOTE:This implementation cannot read compressed data produced - * by {@link SnappyFramedOutputStream}. - *

- * - * @deprecated Prefer the use of {@link SnappyFramedInputStream} which implements - * the standard {@code x-snappy-framed} specification. - */ -@Deprecated -public class SnappyInputStream - extends AbstractSnappyInputStream -{ - private static final int HEADER_LENGTH = 7; - - /** - * Creates a Snappy input stream to read data from the specified underlying input stream. - * - * @param in the underlying input stream - */ - public SnappyInputStream(InputStream in) - throws IOException - { - this(in, true); - } - - /** - * Creates a Snappy input stream to read data from the specified underlying input stream. - * - * @param in the underlying input stream - * @param verifyChecksums if true, checksums in input stream will be verified - */ - public SnappyInputStream(InputStream in, boolean verifyChecksums) - throws IOException - { - super(in, MAX_BLOCK_SIZE, HEADER_LENGTH, verifyChecksums, STREAM_HEADER); - } - - @Override - protected FrameMetaData getFrameMetaData(byte[] frameHeader) - throws IOException - { - int x = frameHeader[0] & 0xFF; - - int a = frameHeader[1] & 0xFF; - int b = frameHeader[2] & 0xFF; - int length = (a << 8) | b; - - FrameAction action; - switch (x) { - case 0x00: - action = FrameAction.RAW; - break; - case 0x01: - action = FrameAction.UNCOMPRESS; - break; - case 's': - if (!Arrays.equals(STREAM_HEADER, frameHeader)) { - throw new IOException(format("invalid compressed flag in header: 0x%02x", x)); - } - action = FrameAction.SKIP; - length = 0; - break; - default: - throw new IOException(format("invalid compressed flag in header: 0x%02x", x)); - } - - if (((length <= 0) || (length > MAX_BLOCK_SIZE)) && action != FrameAction.SKIP) { - throw new IOException("invalid block size in header: " + length); - } - - return new FrameMetaData(action, length); - } - - @Override - protected FrameData getFrameData(byte[] frameHeader, byte[] content, int length) - { - // crc is contained in the frame header - int crc32c = (frameHeader[3] & 0xFF) << 24 | - 
(frameHeader[4] & 0xFF) << 16 | - (frameHeader[5] & 0xFF) << 8 | - (frameHeader[6] & 0xFF); - - return new FrameData(crc32c, 0); - } -} diff --git a/src/main/java/org/iq80/snappy/SnappyInternalUtils.java b/src/main/java/org/iq80/snappy/SnappyInternalUtils.java index b2b00c6..3249194 100644 --- a/src/main/java/org/iq80/snappy/SnappyInternalUtils.java +++ b/src/main/java/org/iq80/snappy/SnappyInternalUtils.java @@ -19,100 +19,10 @@ import java.io.IOException; import java.io.InputStream; -import java.nio.ByteOrder; final class SnappyInternalUtils { - private SnappyInternalUtils() - { - } - - private static final Memory memory; - - static { - // Try to only load one implementation of Memory to assure the call sites are monomorphic (fast) - Memory memoryInstance = null; - - // TODO enable UnsafeMemory on big endian machines - // - // The current UnsafeMemory code assumes the machine is little endian, and will - // not work correctly on big endian CPUs. For now, we will disable UnsafeMemory on - // big endian machines. This will make the code significantly slower on big endian. - // In the future someone should add the necessary flip bytes calls to make this - // work efficiently on big endian machines. 
- if (ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN) { - try { - Class unsafeMemoryClass = SnappyInternalUtils.class.getClassLoader().loadClass("org.iq80.snappy.UnsafeMemory").asSubclass(Memory.class); - Memory unsafeMemory = unsafeMemoryClass.newInstance(); - if (unsafeMemory.loadInt(new byte[4], 0) == 0) { - memoryInstance = unsafeMemory; - } - } - catch (Throwable ignored) { - } - } - if (memoryInstance == null) { - try { - Class slowMemoryClass = SnappyInternalUtils.class.getClassLoader().loadClass("org.iq80.snappy.SlowMemory").asSubclass(Memory.class); - Memory slowMemory = slowMemoryClass.newInstance(); - if (slowMemory.loadInt(new byte[4], 0) == 0) { - memoryInstance = slowMemory; - } - else { - throw new AssertionError("SlowMemory class is broken!"); - } - } - catch (Throwable ignored) { - throw new AssertionError("Could not find SlowMemory class"); - } - } - memory = memoryInstance; - } - - static final boolean HAS_UNSAFE = memory.fastAccessSupported(); - - static boolean equals(byte[] left, int leftIndex, byte[] right, int rightIndex, int length) - { - checkPositionIndexes(leftIndex, leftIndex + length, left.length); - checkPositionIndexes(rightIndex, rightIndex + length, right.length); - - for (int i = 0; i < length; i++) { - if (left[leftIndex + i] != right[rightIndex + i]) { - return false; - } - } - return true; - } - - public static int lookupShort(short[] data, int index) - { - return memory.lookupShort(data, index); - } - - public static int loadByte(byte[] data, int index) - { - return memory.loadByte(data, index); - } - - static int loadInt(byte[] data, int index) - { - return memory.loadInt(data, index); - } - - static void copyLong(byte[] src, int srcIndex, byte[] dest, int destIndex) - { - memory.copyLong(src, srcIndex, dest, destIndex); - } - - static long loadLong(byte[] data, int index) - { - return memory.loadLong(data, index); - } - - static void copyMemory(byte[] input, int inputIndex, byte[] output, int outputIndex, int length) - { 
- memory.copyMemory(input, inputIndex, output, outputIndex, length); - } + private SnappyInternalUtils() {} // // Copied from Guava Preconditions @@ -140,7 +50,7 @@ static void checkPositionIndexes(int start, int end, int size) } } - static String badPositionIndexes(int start, int end, int size) + private static String badPositionIndexes(int start, int end, int size) { if (start < 0 || start > size) { return badPositionIndex(start, size, "start index"); @@ -152,7 +62,7 @@ static String badPositionIndexes(int start, int end, int size) return String.format("end index (%s) must not be less than start index (%s)", end, start); } - static String badPositionIndex(int index, int size, String desc) + private static String badPositionIndex(int index, int size, String desc) { if (index < 0) { return String.format("%s (%s) must not be negative", desc, index); diff --git a/src/main/java/org/iq80/snappy/SnappyOutputStream.java b/src/main/java/org/iq80/snappy/SnappyOutputStream.java deleted file mode 100644 index d6b3afc..0000000 --- a/src/main/java/org/iq80/snappy/SnappyOutputStream.java +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Copyright (C) 2011 the original author or authors. - * See the notice.md file distributed with this work for additional - * information regarding copyright ownership. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.iq80.snappy; - -import java.io.IOException; -import java.io.OutputStream; - -/** - * This class implements an output stream for writing Snappy compressed data. - * The output format is the stream header "snappy\0" followed by one or more - * compressed blocks of data, each of which is preceded by a seven byte header. - *

- * The first byte of the header is a flag indicating if the block is compressed - * or not. A value of 0x00 means uncompressed, and 0x01 means compressed. - *

- * The second and third bytes are the size of the block in the stream as a big - * endian number. This value is never zero as empty blocks are never written. - * The maximum allowed length is 32k (1 << 15). - *

- * The remaining four byes are crc32c checksum of the user input data masked - * with the following function: {@code ((crc >>> 15) | (crc << 17)) + 0xa282ead8 } - *

- * An uncompressed block is simply copied from the input, thus guaranteeing - * that the output is never larger than the input (not including the header). - *

- * NOTE:This data produced by this class is not compatible with the - * {@code x-snappy-framed} specification. It can only be read by - * {@link SnappyInputStream}. - *

- * - * @deprecated Use {@link SnappyFramedOutputStream} which implements - * the standard {@code x-snappy-framed} specification. - */ -@Deprecated -public class SnappyOutputStream - extends AbstractSnappyOutputStream -{ - static final byte[] STREAM_HEADER = new byte[] {'s', 'n', 'a', 'p', 'p', 'y', 0}; - - // the header format requires the max block size to fit in 15 bits -- do not change! - static final int MAX_BLOCK_SIZE = 1 << 15; - - /** - * Write out the uncompressed content if the compression ratio (compressed length / raw length) exceeds this value. - */ - public static final double MIN_COMPRESSION_RATIO = 7.0 / 8.0; - - private final boolean calculateChecksum; - - /** - * Creates a Snappy output stream to write data to the specified underlying output stream. - * - * @param out the underlying output stream - */ - public SnappyOutputStream(OutputStream out) - throws IOException - { - this(out, true); - } - - private SnappyOutputStream(OutputStream out, boolean calculateChecksum) - throws IOException - { - super(out, MAX_BLOCK_SIZE, MIN_COMPRESSION_RATIO); - this.calculateChecksum = calculateChecksum; - } - - /** - * Creates a Snappy output stream with block checksums disabled. This is only useful for - * apples-to-apples benchmarks with other compressors that do not perform block checksums. - * - * @param out the underlying output stream - */ - public static SnappyOutputStream newChecksumFreeBenchmarkOutputStream(OutputStream out) - throws IOException - { - return new SnappyOutputStream(out, false); - } - - @Override - protected void writeHeader(OutputStream out) - throws IOException - { - out.write(STREAM_HEADER); - } - - @Override - protected int calculateCRC32C(byte[] data, int offset, int length) - { - return calculateChecksum ? 
super.calculateCRC32C(data, offset, length) : 0; - } - - @Override - protected void writeBlock(OutputStream out, byte[] data, int offset, int length, boolean compressed, int crc32c) - throws IOException - { - // write compressed flag - out.write(compressed ? 0x01 : 0x00); - - // write length - out.write(length >>> 8); - out.write(length); - - // write crc32c of user input data - out.write(crc32c >>> 24); - out.write(crc32c >>> 16); - out.write(crc32c >>> 8); - out.write(crc32c); - - // write data - out.write(data, offset, length); - } -} diff --git a/src/main/java/org/iq80/snappy/SnappyRawCompressor.java b/src/main/java/org/iq80/snappy/SnappyRawCompressor.java new file mode 100644 index 0000000..b9ee582 --- /dev/null +++ b/src/main/java/org/iq80/snappy/SnappyRawCompressor.java @@ -0,0 +1,411 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.iq80.snappy; + +import java.util.Arrays; + +import static org.iq80.snappy.SnappyConstants.COPY_1_BYTE_OFFSET; +import static org.iq80.snappy.SnappyConstants.COPY_2_BYTE_OFFSET; +import static org.iq80.snappy.SnappyConstants.SIZE_OF_INT; +import static org.iq80.snappy.SnappyConstants.SIZE_OF_LONG; +import static org.iq80.snappy.SnappyConstants.SIZE_OF_SHORT; +import static org.iq80.snappy.UnsafeUtil.UNSAFE; + +final class SnappyRawCompressor +{ + // The size of a compression block. 
Note that many parts of the compression + // code assumes that BLOCK_SIZE <= 65536; in particular, the hash table + // can only store 16-bit offsets, and EmitCopy() also assumes the offset + // is 65535 bytes or less. Note also that if you change this, it will + // affect the framing format (see framing_format.txt). + // + // Note that there might be older data around that is compressed with larger + // block sizes, so the decompression code should not rely on the + // non-existence of long back-references. + private static final int BLOCK_LOG = 16; + private static final int BLOCK_SIZE = 1 << BLOCK_LOG; + + private static final int INPUT_MARGIN_BYTES = 15; + + private static final int MAX_HASH_TABLE_BITS = 14; + public static final int MAX_HASH_TABLE_SIZE = 1 << MAX_HASH_TABLE_BITS; + + private SnappyRawCompressor() {} + + public static int maxCompressedLength(int sourceLength) + { + // Compressed data can be defined as: + // compressed := item* literal* + // item := literal* copy + // + // The trailing literal sequence has a space blowup of at most 62/60 + // since a literal of length 60 needs one tag byte + one extra byte + // for length information. + // + // Item blowup is trickier to measure. Suppose the "copy" op copies + // 4 bytes of data. Because of a special check in the encoding code, + // we produce a 4-byte copy only if the offset is < 65536. Therefore + // the copy op takes 3 bytes to encode, and this type of item leads + // to at most the 62/60 blowup for representing literals. + // + // Suppose the "copy" op copies 5 bytes of data. If the offset is big + // enough, it will take 5 bytes to encode the copy op. Therefore the + // worst case here is a one-byte literal followed by a five-byte copy. + // I.e., 6 bytes of input turn into 7 bytes of "compressed" data. 
+ // + // This last factor dominates the blowup, so the final estimate is: + return 32 + sourceLength + sourceLength / 6; + } + + // suppress warnings is required to use assert + @SuppressWarnings("IllegalToken") + public static int compress( + final Object inputBase, + final long inputAddress, + final long inputLimit, + final Object outputBase, + final long outputAddress, + final long outputLimit, + final short[] table) + { + // The compression code assumes output is larger than the max compression size (with 32 bytes of + // extra padding), and does not check bounds for writing to output. + int maxCompressedLength = maxCompressedLength((int) (inputLimit - inputAddress)); + if (outputLimit - outputAddress < maxCompressedLength) { + throw new IllegalArgumentException("Output buffer must be at least " + maxCompressedLength + " bytes"); + } + + // First write the uncompressed size to the output as a variable length int + long output = writeUncompressedLength(outputBase, outputAddress, (int) (inputLimit - inputAddress)); + + for (long blockAddress = inputAddress; blockAddress < inputLimit; blockAddress += BLOCK_SIZE) { + final long blockLimit = Math.min(inputLimit, blockAddress + BLOCK_SIZE); + long input = blockAddress; + assert blockLimit - blockAddress <= BLOCK_SIZE; + + int blockHashTableSize = getHashTableSize((int) (blockLimit - blockAddress)); + Arrays.fill(table, 0, blockHashTableSize, (short) 0); + + // todo given that hashTableSize is required to be a power of 2, this is overly complex + final int shift = 32 - log2Floor(blockHashTableSize); + assert (blockHashTableSize & (blockHashTableSize - 1)) == 0 : "table must be power of two"; + assert 0xFFFFFFFF >>> shift == blockHashTableSize - 1; + + // Bytes in [nextEmitAddress, input) will be emitted as literal bytes. Or + // [nextEmitAddress, inputLimit) after the main loop. 
+ long nextEmitAddress = input; + + final long fastInputLimit = blockLimit - INPUT_MARGIN_BYTES; + while (input <= fastInputLimit) { + assert nextEmitAddress <= input; + + // The body of this loop emits a literal once and then emits a copy one + // or more times. (The exception is that when we're close to exhausting + // the input we exit and emit a literal.) + // + // In the first iteration of this loop we're just starting, so + // there's nothing to copy, so we must emit a literal once. And we + // only start a new iteration when the current iteration has determined + // that a literal will precede the next copy (if any). + // + // Step 1: Scan forward in the input looking for a 4-byte-long match. + // If we get close to exhausting the input exit and emit a final literal. + // + // Heuristic match skipping: If 32 bytes are scanned with no matches + // found, start looking only at every other byte. If 32 more bytes are + // scanned, look at every third byte, etc.. When a match is found, + // immediately go back to looking at every byte. This is a small loss + // (~5% performance, ~0.1% density) for compressible data due to more + // bookkeeping, but for non-compressible data (such as JPEG) it's a huge + // win since the compressor quickly "realizes" the data is incompressible + // and doesn't bother looking for matches everywhere. + // + // The "skip" variable keeps track of how many bytes there are since the + // last match; dividing it by 32 (ie. right-shifting by five) gives the + // number of bytes to move ahead for each iteration. 
+ int skip = 32; + + long candidateIndex = 0; + for (input += 1; input + (skip >>> 5) <= fastInputLimit; input += ((skip++) >>> 5)) { + // hash the 4 bytes starting at the input pointer + int currentInt = UNSAFE.getInt(inputBase, input); + int hash = hashBytes(currentInt, shift); + + // get the position of a 4 bytes sequence with the same hash + candidateIndex = blockAddress + (table[hash] & 0xFFFF); + assert candidateIndex >= 0; + assert candidateIndex < input; + + // update the hash to point to the current position + table[hash] = (short) (input - blockAddress); + + // if the 4 byte sequence a the candidate index matches the sequence at the + // current position, proceed to the next phase + if (currentInt == UNSAFE.getInt(inputBase, candidateIndex)) { + break; + } + } + if (input + (skip >>> 5) > fastInputLimit) { + break; + } + + // Step 2: A 4-byte match has been found. We'll later see if more + // than 4 bytes match. But, prior to the match, input + // bytes [nextEmit, ip) are unmatched. Emit them as "literal bytes." + assert nextEmitAddress + 16 <= blockLimit; + + int literalLength = (int) (input - nextEmitAddress); + output = emitLiteralLength(outputBase, output, literalLength); + + // Fast copy can use 8 extra bytes of input and output, which is safe because: + // - The input will always have INPUT_MARGIN_BYTES = 15 extra available bytes + // - The output will always have 32 spare bytes (see MaxCompressedLength). + output = fastCopy(inputBase, nextEmitAddress, outputBase, output, literalLength); + + // Step 3: Call EmitCopy, and then see if another EmitCopy could + // be our next move. Repeat until we find no match for the + // input immediately after what was consumed by the last EmitCopy call. + // + // If we exit this loop normally then we need to call EmitLiteral next, + // though we don't yet know how big the literal will be. We handle that + // by proceeding to the next iteration of the main loop. 
We also can exit + // this loop via goto if we get close to exhausting the input. + int inputBytes; + do { + // We have a 4-byte match at input, and no need to emit any + // "literal bytes" prior to input. + assert (blockLimit >= input + SIZE_OF_INT); + + // determine match length + int matched = count(inputBase, input + SIZE_OF_INT, candidateIndex + SIZE_OF_INT, blockLimit); + matched += SIZE_OF_INT; + + // Emit the copy operation for this chunk + output = emitCopy(outputBase, output, input, candidateIndex, matched); + input += matched; + + // are we done? + if (input >= fastInputLimit) { + break; + } + + // We could immediately start working at input now, but to improve + // compression we first update table[Hash(ip - 1, ...)]. + long longValue = UNSAFE.getLong(inputBase, input - 1); + int prevInt = (int) longValue; + inputBytes = (int) (longValue >>> 8); + + // add hash starting with previous byte + int prevHash = hashBytes(prevInt, shift); + table[prevHash] = (short) (input - blockAddress - 1); + + // update hash of current byte + int curHash = hashBytes(inputBytes, shift); + + candidateIndex = blockAddress + (table[curHash] & 0xFFFF); + table[curHash] = (short) (input - blockAddress); + } while (inputBytes == UNSAFE.getInt(inputBase, candidateIndex)); + nextEmitAddress = input; + } + + // Emit the remaining bytes as a literal + if (nextEmitAddress < blockLimit) { + int literalLength = (int) (blockLimit - nextEmitAddress); + output = emitLiteralLength(outputBase, output, literalLength); + UNSAFE.copyMemory(inputBase, nextEmitAddress, outputBase, output, literalLength); + output += literalLength; + } + } + + return (int) (output - outputAddress); + } + + private static int count(Object inputBase, final long start, long matchStart, long matchLimit) + { + long current = start; + + // first, compare long at a time + while (current < matchLimit - (SIZE_OF_LONG - 1)) { + long diff = UNSAFE.getLong(inputBase, matchStart) ^ UNSAFE.getLong(inputBase, current); + if 
(diff != 0) { + current += Long.numberOfTrailingZeros(diff) >> 3; + return (int) (current - start); + } + + current += SIZE_OF_LONG; + matchStart += SIZE_OF_LONG; + } + + if (current < matchLimit - (SIZE_OF_INT - 1) && UNSAFE.getInt(inputBase, matchStart) == UNSAFE.getInt(inputBase, current)) { + current += SIZE_OF_INT; + matchStart += SIZE_OF_INT; + } + + if (current < matchLimit - (SIZE_OF_SHORT - 1) && UNSAFE.getShort(inputBase, matchStart) == UNSAFE.getShort(inputBase, current)) { + current += SIZE_OF_SHORT; + matchStart += SIZE_OF_SHORT; + } + + if (current < matchLimit && UNSAFE.getByte(inputBase, matchStart) == UNSAFE.getByte(inputBase, current)) { + ++current; + } + + return (int) (current - start); + } + + private static long emitLiteralLength(Object outputBase, long output, int literalLength) + { + int n = literalLength - 1; // Zero-length literals are disallowed + if (n < 60) { + // Size fits in tag byte + UNSAFE.putByte(outputBase, output++, (byte) (n << 2)); + } + else { + int bytes; + if (n < (1 << 8)) { + UNSAFE.putByte(outputBase, output++, (byte) (59 + 1 << 2)); + bytes = 1; + } + else if (n < (1 << 16)) { + UNSAFE.putByte(outputBase, output++, (byte) (59 + 2 << 2)); + bytes = 2; + } + else if (n < (1 << 24)) { + UNSAFE.putByte(outputBase, output++, (byte) (59 + 3 << 2)); + bytes = 3; + } + else { + UNSAFE.putByte(outputBase, output++, (byte) (59 + 4 << 2)); + bytes = 4; + } + // System is assumed to be little endian, so low bytes will be zero for the smaller numbers + UNSAFE.putInt(outputBase, output, n); + output += bytes; + } + return output; + } + + private static long fastCopy(final Object inputBase, long input, final Object outputBase, long output, final int literalLength) + { + final long outputLimit = output + literalLength; + do { + UNSAFE.putLong(outputBase, output, UNSAFE.getLong(inputBase, input)); + input += SIZE_OF_LONG; + output += SIZE_OF_LONG; + } + while (output < outputLimit); + return outputLimit; + } + + private static long 
emitCopy(Object outputBase, long output, long input, long matchIndex, int matchLength) + { + long offset = input - matchIndex; + + // Emit 64 byte copies but make sure to keep at least four bytes reserved + while (matchLength >= 68) { + UNSAFE.putByte(outputBase, output++, (byte) (COPY_2_BYTE_OFFSET + ((64 - 1) << 2))); + UNSAFE.putShort(outputBase, output, (short) offset); + output += SIZE_OF_SHORT; + matchLength -= 64; + } + + // Emit an extra 60 byte copy if have too much data to fit in one copy + // length < 68 + if (matchLength > 64) { + UNSAFE.putByte(outputBase, output++, (byte) (COPY_2_BYTE_OFFSET + ((60 - 1) << 2))); + UNSAFE.putShort(outputBase, output, (short) offset); + output += SIZE_OF_SHORT; + matchLength -= 60; + } + + // Emit remainder + if ((matchLength < 12) && (offset < 2048)) { + int lenMinus4 = matchLength - 4; + UNSAFE.putByte(outputBase, output++, (byte) (COPY_1_BYTE_OFFSET + ((lenMinus4) << 2) + ((offset >>> 8) << 5))); + UNSAFE.putByte(outputBase, output++, (byte) (offset)); + } + else { + UNSAFE.putByte(outputBase, output++, (byte) (COPY_2_BYTE_OFFSET + ((matchLength - 1) << 2))); + UNSAFE.putShort(outputBase, output, (short) offset); + output += SIZE_OF_SHORT; + } + return output; + } + + @SuppressWarnings("IllegalToken") + private static int getHashTableSize(int inputSize) + { + // Use smaller hash table when input.size() is smaller, since we + // fill the table, incurring O(hash table size) overhead for + // compression, and if the input is short, we won't need that + // many hash table entries anyway. 
+ assert (MAX_HASH_TABLE_SIZE >= 256); + + // smallest power of 2 larger than inputSize + int target = Integer.highestOneBit(inputSize - 1) << 1; + + // keep it between MIN_TABLE_SIZE and MAX_TABLE_SIZE + return Math.max(Math.min(target, MAX_HASH_TABLE_SIZE), 256); + } + + // Any hash function will produce a valid compressed stream, but a good + // hash function reduces the number of collisions and thus yields better + // compression for compressible input, and more speed for incompressible + // input. Of course, it doesn't hurt if the hash function is reasonably fast + // either, as it gets called a lot. + private static int hashBytes(int value, int shift) + { + return (value * 0x1e35a7bd) >>> shift; + } + + private static int log2Floor(int n) + { + return n == 0 ? -1 : 31 ^ Integer.numberOfLeadingZeros(n); + } + + private static final int HIGH_BIT_MASK = 0x80; + + /** + * Writes the uncompressed length as variable length integer. + */ + private static long writeUncompressedLength(Object outputBase, long outputAddress, int uncompressedLength) + { + if (uncompressedLength < (1 << 7) && uncompressedLength >= 0) { + UNSAFE.putByte(outputBase, outputAddress++, (byte) (uncompressedLength)); + } + else if (uncompressedLength < (1 << 14) && uncompressedLength > 0) { + UNSAFE.putByte(outputBase, outputAddress++, (byte) (uncompressedLength | HIGH_BIT_MASK)); + UNSAFE.putByte(outputBase, outputAddress++, (byte) (uncompressedLength >>> 7)); + } + else if (uncompressedLength < (1 << 21) && uncompressedLength > 0) { + UNSAFE.putByte(outputBase, outputAddress++, (byte) (uncompressedLength | HIGH_BIT_MASK)); + UNSAFE.putByte(outputBase, outputAddress++, (byte) ((uncompressedLength >>> 7) | HIGH_BIT_MASK)); + UNSAFE.putByte(outputBase, outputAddress++, (byte) (uncompressedLength >>> 14)); + } + else if (uncompressedLength < (1 << 28) && uncompressedLength > 0) { + UNSAFE.putByte(outputBase, outputAddress++, (byte) (uncompressedLength | HIGH_BIT_MASK)); + 
UNSAFE.putByte(outputBase, outputAddress++, (byte) ((uncompressedLength >>> 7) | HIGH_BIT_MASK)); + UNSAFE.putByte(outputBase, outputAddress++, (byte) ((uncompressedLength >>> 14) | HIGH_BIT_MASK)); + UNSAFE.putByte(outputBase, outputAddress++, (byte) (uncompressedLength >>> 21)); + } + else { + UNSAFE.putByte(outputBase, outputAddress++, (byte) (uncompressedLength | HIGH_BIT_MASK)); + UNSAFE.putByte(outputBase, outputAddress++, (byte) ((uncompressedLength >>> 7) | HIGH_BIT_MASK)); + UNSAFE.putByte(outputBase, outputAddress++, (byte) ((uncompressedLength >>> 14) | HIGH_BIT_MASK)); + UNSAFE.putByte(outputBase, outputAddress++, (byte) ((uncompressedLength >>> 21) | HIGH_BIT_MASK)); + UNSAFE.putByte(outputBase, outputAddress++, (byte) (uncompressedLength >>> 28)); + } + return outputAddress; + } +} diff --git a/src/main/java/org/iq80/snappy/SnappyRawDecompressor.java b/src/main/java/org/iq80/snappy/SnappyRawDecompressor.java new file mode 100644 index 0000000..4769170 --- /dev/null +++ b/src/main/java/org/iq80/snappy/SnappyRawDecompressor.java @@ -0,0 +1,320 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.iq80.snappy; + +import static org.iq80.snappy.SnappyConstants.LITERAL; +import static org.iq80.snappy.SnappyConstants.SIZE_OF_INT; +import static org.iq80.snappy.SnappyConstants.SIZE_OF_LONG; +import static org.iq80.snappy.UnsafeUtil.UNSAFE; + +final class SnappyRawDecompressor +{ + private static final int[] DEC_32_TABLE = {4, 1, 2, 1, 4, 4, 4, 4}; + private static final int[] DEC_64_TABLE = {0, 0, 0, -1, 0, 1, 2, 3}; + + private SnappyRawDecompressor() {} + + public static int getUncompressedLength(Object compressed, long compressedAddress, long compressedLimit) + { + return readUncompressedLength(compressed, compressedAddress, compressedLimit)[0]; + } + + public static int decompress( + final Object inputBase, + final long inputAddress, + final long inputLimit, + final Object outputBase, + final long outputAddress, + final long outputLimit) + { + // Read the uncompressed length from the front of the input + long input = inputAddress; + int[] varInt = readUncompressedLength(inputBase, input, inputLimit); + int expectedLength = varInt[0]; + input += varInt[1]; + + SnappyInternalUtils.checkArgument(expectedLength <= (outputLimit - outputAddress), + "Uncompressed length %s must be less than %s", expectedLength, (outputLimit - outputAddress)); + + // Process the entire input + int uncompressedSize = uncompressAll( + inputBase, + input, + inputLimit, + outputBase, + outputAddress, + outputLimit); + + if (!(expectedLength == uncompressedSize)) { + throw new CorruptionException(0, String.format("Recorded length is %s bytes but actual length after decompression is %s bytes ", + expectedLength, + uncompressedSize)); + } + + return expectedLength; + } + + private static int uncompressAll( + final Object inputBase, + final long inputAddress, + final long inputLimit, + final Object outputBase, + final long outputAddress, + final long outputLimit) + { + final long fastOutputLimit = outputLimit - SIZE_OF_LONG; // maximum offset in output buffer to which it's 
safe to write long-at-a-time + + long output = outputAddress; + long input = inputAddress; + + while (input < inputLimit) { + int opCode = UNSAFE.getByte(inputBase, input++) & 0xFF; + int entry = opLookupTable[opCode] & 0xFFFF; + + int trailerBytes = entry >>> 11; + int trailer = 0; + if (input + SIZE_OF_INT < inputLimit) { + trailer = UNSAFE.getInt(inputBase, input) & wordmask[trailerBytes]; + } + else { + if (input + trailerBytes > inputLimit) { + throw new CorruptionException(input - inputAddress); + } + switch (trailerBytes) { + case 4: + trailer = (UNSAFE.getByte(inputBase, input + 3) & 0xff) << 24; + case 3: + trailer |= (UNSAFE.getByte(inputBase, input + 2) & 0xff) << 16; + case 2: + trailer |= (UNSAFE.getByte(inputBase, input + 1) & 0xff) << 8; + case 1: + trailer |= (UNSAFE.getByte(inputBase, input) & 0xff); + } + } + if (trailer < 0) { + throw new CorruptionException(input - inputAddress); + } + input += trailerBytes; + + int length = entry & 0xff; + if (length == 0) { + continue; + } + + if ((opCode & 0x3) == LITERAL) { + int literalLength = length + trailer; + if (literalLength < 0) { + throw new CorruptionException(input - inputAddress); + } + + // copy literal + long literalOutputLimit = output + literalLength; + if (literalOutputLimit > fastOutputLimit || input + literalLength > inputLimit - SIZE_OF_LONG) { + if (literalOutputLimit > outputLimit || input + literalLength > inputLimit) { + throw new CorruptionException(input - inputAddress); + } + + // slow, precise copy + UNSAFE.copyMemory(inputBase, input, outputBase, output, literalLength); + input += literalLength; + output += literalLength; + } + else { + // fast copy. 
We may over-copy but there's enough room in input and output to not overrun them + do { + UNSAFE.putLong(outputBase, output, UNSAFE.getLong(inputBase, input)); + input += SIZE_OF_LONG; + output += SIZE_OF_LONG; + } + while (output < literalOutputLimit); + input -= (output - literalOutputLimit); // adjust index if we over-copied + output = literalOutputLimit; + } + } + else { + // matchOffset/256 is encoded in bits 8..10. By just fetching + // those bits, we get matchOffset (since the bit-field starts at + // bit 8). + int matchOffset = entry & 0x700; + matchOffset += trailer; + if (matchOffset < 0) { + throw new CorruptionException(input - inputAddress); + } + + long matchAddress = output - matchOffset; + if (matchAddress < outputAddress || output + length > outputLimit) { + throw new CorruptionException(input - inputAddress); + } + long matchOutputLimit = output + length; + if (matchOutputLimit > outputLimit) { + throw new CorruptionException(input - inputAddress); + } + + if (output > fastOutputLimit) { + // slow match copy + while (output < matchOutputLimit) { + UNSAFE.putByte(outputBase, output++, UNSAFE.getByte(outputBase, matchAddress++)); + } + } + else { + // copy repeated sequence + if (matchOffset < SIZE_OF_LONG) { + // 8 bytes apart so that we can copy long-at-a-time below + int increment32 = DEC_32_TABLE[matchOffset]; + int decrement64 = DEC_64_TABLE[matchOffset]; + + UNSAFE.putByte(outputBase, output, UNSAFE.getByte(outputBase, matchAddress)); + UNSAFE.putByte(outputBase, output + 1, UNSAFE.getByte(outputBase, matchAddress + 1)); + UNSAFE.putByte(outputBase, output + 2, UNSAFE.getByte(outputBase, matchAddress + 2)); + UNSAFE.putByte(outputBase, output + 3, UNSAFE.getByte(outputBase, matchAddress + 3)); + output += SIZE_OF_INT; + matchAddress += increment32; + + UNSAFE.putInt(outputBase, output, UNSAFE.getInt(outputBase, matchAddress)); + output += SIZE_OF_INT; + matchAddress -= decrement64; + } + else { + UNSAFE.putLong(outputBase, output, 
UNSAFE.getLong(outputBase, matchAddress)); + matchAddress += SIZE_OF_LONG; + output += SIZE_OF_LONG; + } + + if (matchOutputLimit > fastOutputLimit) { + while (output < fastOutputLimit) { + UNSAFE.putLong(outputBase, output, UNSAFE.getLong(outputBase, matchAddress)); + matchAddress += SIZE_OF_LONG; + output += SIZE_OF_LONG; + } + + while (output < matchOutputLimit) { + UNSAFE.putByte(outputBase, output++, UNSAFE.getByte(outputBase, matchAddress++)); + } + } + else { + while (output < matchOutputLimit) { + UNSAFE.putLong(outputBase, output, UNSAFE.getLong(outputBase, matchAddress)); + matchAddress += SIZE_OF_LONG; + output += SIZE_OF_LONG; + } + } + } + output = matchOutputLimit; // correction in case we over-copied + } + } + + return (int) (output - outputAddress); + } + + // Mapping from i in range [0,4] to a mask to extract the bottom 8*i bits + private static final int[] wordmask = new int[] { + 0, 0xff, 0xffff, 0xffffff, 0xffffffff + }; + + // Data stored per entry in lookup table: + // Range Bits-used Description + // ------------------------------------ + // 1..64 0..7 Literal/copy length encoded in opcode byte + // 0..7 8..10 Copy offset encoded in opcode byte / 256 + // 0..4 11..13 Extra bytes after opcode + // + // We use eight bits for the length even though 7 would have sufficed + // because of efficiency reasons: + // (1) Extracting a byte is faster than a bit-field + // (2) It properly aligns copy offset so we do not need a <<8 + private static final short[] opLookupTable = new short[] { + 0x0001, 0x0804, 0x1001, 0x2001, 0x0002, 0x0805, 0x1002, 0x2002, + 0x0003, 0x0806, 0x1003, 0x2003, 0x0004, 0x0807, 0x1004, 0x2004, + 0x0005, 0x0808, 0x1005, 0x2005, 0x0006, 0x0809, 0x1006, 0x2006, + 0x0007, 0x080a, 0x1007, 0x2007, 0x0008, 0x080b, 0x1008, 0x2008, + 0x0009, 0x0904, 0x1009, 0x2009, 0x000a, 0x0905, 0x100a, 0x200a, + 0x000b, 0x0906, 0x100b, 0x200b, 0x000c, 0x0907, 0x100c, 0x200c, + 0x000d, 0x0908, 0x100d, 0x200d, 0x000e, 0x0909, 0x100e, 0x200e, + 0x000f, 
0x090a, 0x100f, 0x200f, 0x0010, 0x090b, 0x1010, 0x2010, + 0x0011, 0x0a04, 0x1011, 0x2011, 0x0012, 0x0a05, 0x1012, 0x2012, + 0x0013, 0x0a06, 0x1013, 0x2013, 0x0014, 0x0a07, 0x1014, 0x2014, + 0x0015, 0x0a08, 0x1015, 0x2015, 0x0016, 0x0a09, 0x1016, 0x2016, + 0x0017, 0x0a0a, 0x1017, 0x2017, 0x0018, 0x0a0b, 0x1018, 0x2018, + 0x0019, 0x0b04, 0x1019, 0x2019, 0x001a, 0x0b05, 0x101a, 0x201a, + 0x001b, 0x0b06, 0x101b, 0x201b, 0x001c, 0x0b07, 0x101c, 0x201c, + 0x001d, 0x0b08, 0x101d, 0x201d, 0x001e, 0x0b09, 0x101e, 0x201e, + 0x001f, 0x0b0a, 0x101f, 0x201f, 0x0020, 0x0b0b, 0x1020, 0x2020, + 0x0021, 0x0c04, 0x1021, 0x2021, 0x0022, 0x0c05, 0x1022, 0x2022, + 0x0023, 0x0c06, 0x1023, 0x2023, 0x0024, 0x0c07, 0x1024, 0x2024, + 0x0025, 0x0c08, 0x1025, 0x2025, 0x0026, 0x0c09, 0x1026, 0x2026, + 0x0027, 0x0c0a, 0x1027, 0x2027, 0x0028, 0x0c0b, 0x1028, 0x2028, + 0x0029, 0x0d04, 0x1029, 0x2029, 0x002a, 0x0d05, 0x102a, 0x202a, + 0x002b, 0x0d06, 0x102b, 0x202b, 0x002c, 0x0d07, 0x102c, 0x202c, + 0x002d, 0x0d08, 0x102d, 0x202d, 0x002e, 0x0d09, 0x102e, 0x202e, + 0x002f, 0x0d0a, 0x102f, 0x202f, 0x0030, 0x0d0b, 0x1030, 0x2030, + 0x0031, 0x0e04, 0x1031, 0x2031, 0x0032, 0x0e05, 0x1032, 0x2032, + 0x0033, 0x0e06, 0x1033, 0x2033, 0x0034, 0x0e07, 0x1034, 0x2034, + 0x0035, 0x0e08, 0x1035, 0x2035, 0x0036, 0x0e09, 0x1036, 0x2036, + 0x0037, 0x0e0a, 0x1037, 0x2037, 0x0038, 0x0e0b, 0x1038, 0x2038, + 0x0039, 0x0f04, 0x1039, 0x2039, 0x003a, 0x0f05, 0x103a, 0x203a, + 0x003b, 0x0f06, 0x103b, 0x203b, 0x003c, 0x0f07, 0x103c, 0x203c, + 0x0801, 0x0f08, 0x103d, 0x203d, 0x1001, 0x0f09, 0x103e, 0x203e, + 0x1801, 0x0f0a, 0x103f, 0x203f, 0x2001, 0x0f0b, 0x1040, 0x2040 + }; + + /** + * Reads the variable length integer encoded a the specified offset, and + * returns this length with the number of bytes read. 
+ */ + private static int[] readUncompressedLength(Object compressed, long compressedAddress, long compressedLimit) + { + int result; + int bytesRead = 0; + { + int b = getUnsignedByteSafe(compressed, compressedAddress + bytesRead, compressedLimit); + bytesRead++; + result = b & 0x7f; + if ((b & 0x80) != 0) { + b = getUnsignedByteSafe(compressed, compressedAddress + bytesRead, compressedLimit); + bytesRead++; + result |= (b & 0x7f) << 7; + if ((b & 0x80) != 0) { + b = getUnsignedByteSafe(compressed, compressedAddress + bytesRead, compressedLimit); + bytesRead++; + result |= (b & 0x7f) << 14; + if ((b & 0x80) != 0) { + b = getUnsignedByteSafe(compressed, compressedAddress + bytesRead, compressedLimit); + bytesRead++; + result |= (b & 0x7f) << 21; + if ((b & 0x80) != 0) { + b = getUnsignedByteSafe(compressed, compressedAddress + bytesRead, compressedLimit); + bytesRead++; + result |= (b & 0x7f) << 28; + if ((b & 0x80) != 0) { + throw new CorruptionException(compressedAddress + bytesRead, "last byte of compressed length int has high bit set"); + } + } + } + } + } + } + if (result < 0) { + throw new CorruptionException(compressedAddress, "negative compressed length"); + } + return new int[] {result, bytesRead}; + } + + private static int getUnsignedByteSafe(Object base, long address, long limit) + { + if (address >= limit) { + throw new CorruptionException(limit - address, "Input is truncated"); + } + return UNSAFE.getByte(base, address) & 0xFF; + } +} diff --git a/src/main/java/org/iq80/snappy/UnsafeMemory.java b/src/main/java/org/iq80/snappy/UnsafeMemory.java deleted file mode 100644 index 478ca97..0000000 --- a/src/main/java/org/iq80/snappy/UnsafeMemory.java +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Copyright (C) 2011 the original author or authors. - * See the notice.md file distributed with this work for additional - * information regarding copyright ownership. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.iq80.snappy; - -import sun.misc.Unsafe; - -import java.lang.reflect.Field; - -class UnsafeMemory - implements Memory -{ - private static final Unsafe unsafe; - - static { - try { - Field theUnsafe = Unsafe.class.getDeclaredField("theUnsafe"); - theUnsafe.setAccessible(true); - unsafe = (Unsafe) theUnsafe.get(null); - // It seems not all Unsafe implementations implement the following method. - new UnsafeMemory().copyMemory(new byte[1], 0, new byte[1], 0, 1); - } - catch (Exception e) { - throw new RuntimeException(e); - } - } - - private static final long BYTE_ARRAY_OFFSET = unsafe.arrayBaseOffset(byte[].class); - private static final long SHORT_ARRAY_OFFSET = unsafe.arrayBaseOffset(short[].class); - private static final long SHORT_ARRAY_STRIDE = unsafe.arrayIndexScale(short[].class); - - @Override - public boolean fastAccessSupported() - { - return true; - } - - @Override - public int lookupShort(short[] data, int index) - { - assert index >= 0; - assert index <= data.length; - return unsafe.getShort(data, SHORT_ARRAY_OFFSET + (index * SHORT_ARRAY_STRIDE)) & 0xFFFF; - } - - @Override - public int loadByte(byte[] data, int index) - { - assert index >= 0; - assert index <= data.length; - return unsafe.getByte(data, BYTE_ARRAY_OFFSET + index) & 0xFF; - } - - @Override - public int loadInt(byte[] data, int index) - { - assert index >= 0; - assert index + 4 <= data.length; - return 
unsafe.getInt(data, BYTE_ARRAY_OFFSET + index); - } - - @Override - public void copyLong(byte[] src, int srcIndex, byte[] dest, int destIndex) - { - assert srcIndex >= 0; - assert srcIndex + 8 <= src.length; - assert destIndex >= 0; - assert destIndex + 8 <= dest.length; - long value = unsafe.getLong(src, BYTE_ARRAY_OFFSET + srcIndex); - unsafe.putLong(dest, (BYTE_ARRAY_OFFSET + destIndex), value); - } - - @Override - public long loadLong(byte[] data, int index) - { - assert index > 0; - assert index + 4 < data.length; - return unsafe.getLong(data, BYTE_ARRAY_OFFSET + index); - } - - @Override - public void copyMemory(byte[] input, int inputIndex, byte[] output, int outputIndex, int length) - { - assert inputIndex >= 0; - assert inputIndex + length <= input.length; - assert outputIndex >= 0; - assert outputIndex + length <= output.length; - unsafe.copyMemory(input, BYTE_ARRAY_OFFSET + inputIndex, output, BYTE_ARRAY_OFFSET + outputIndex, length); - } -} diff --git a/src/main/java/org/iq80/snappy/UnsafeUtil.java b/src/main/java/org/iq80/snappy/UnsafeUtil.java new file mode 100644 index 0000000..f102c01 --- /dev/null +++ b/src/main/java/org/iq80/snappy/UnsafeUtil.java @@ -0,0 +1,44 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.iq80.snappy; + +import sun.misc.Unsafe; + +import java.lang.reflect.Field; +import java.nio.ByteOrder; + +import static java.lang.String.format; + +final class UnsafeUtil +{ + public static final Unsafe UNSAFE; + + private UnsafeUtil() {} + + static { + ByteOrder order = ByteOrder.nativeOrder(); + if (!order.equals(ByteOrder.LITTLE_ENDIAN)) { + throw new IncompatibleJvmException(format("Snappy requires a little endian platform (found %s)", order)); + } + + try { + Field theUnsafe = Unsafe.class.getDeclaredField("theUnsafe"); + theUnsafe.setAccessible(true); + UNSAFE = (Unsafe) theUnsafe.get(null); + } + catch (Exception e) { + throw new IncompatibleJvmException("Snappy requires access to sun.misc.Unsafe"); + } + } +} \ No newline at end of file diff --git a/src/test/java/org/iq80/snappy/BenchmarkDriver.java b/src/test/java/org/iq80/snappy/BenchmarkDriver.java index a4c0034..5d7f2a5 100644 --- a/src/test/java/org/iq80/snappy/BenchmarkDriver.java +++ b/src/test/java/org/iq80/snappy/BenchmarkDriver.java @@ -225,7 +225,7 @@ public long compress(TestData testData, long iterations) long start = System.nanoTime(); while (iterations-- > 0) { rawOut.reset(); - SnappyOutputStream out = SnappyOutputStream.newChecksumFreeBenchmarkOutputStream(rawOut); + SnappyFramedOutputStream out = SnappyFramedOutputStream.newChecksumFreeBenchmarkOutputStream(rawOut); out.write(contents); out.close(); } @@ -248,7 +248,7 @@ public long uncompress(TestData testData, long iterations) byte[] contents = testData.getContents(); ByteArrayOutputStream compressedStream = new ByteArrayOutputStream(Snappy.maxCompressedLength(contents.length)); - SnappyOutputStream out = SnappyOutputStream.newChecksumFreeBenchmarkOutputStream(compressedStream); + SnappyFramedOutputStream out = SnappyFramedOutputStream.newChecksumFreeBenchmarkOutputStream(compressedStream); out.write(contents); out.close(); byte[] compressed = compressedStream.toByteArray(); @@ -259,7 +259,7 @@ public long 
uncompress(TestData testData, long iterations) long start = System.nanoTime(); while (iterations-- > 0) { ByteArrayInputStream compIn = new ByteArrayInputStream(compressed); - SnappyInputStream in = new SnappyInputStream(compIn, false); + SnappyFramedInputStream in = new SnappyFramedInputStream(compIn, false); while (in.read(inputBuffer) >= 0) { } @@ -287,12 +287,12 @@ public long roundTrip(TestData testData, long iterations) long start = System.nanoTime(); while (iterations-- > 0) { compressedStream.reset(); - SnappyOutputStream out = SnappyOutputStream.newChecksumFreeBenchmarkOutputStream(compressedStream); + SnappyFramedOutputStream out = SnappyFramedOutputStream.newChecksumFreeBenchmarkOutputStream(compressedStream); out.write(contents); out.close(); ByteArrayInputStream compIn = new ByteArrayInputStream(compressedStream.getBuffer(), 0, compressedStream.size()); - SnappyInputStream in = new SnappyInputStream(compIn, false); + SnappyFramedInputStream in = new SnappyFramedInputStream(compIn, false); while (in.read(inputBuffer) >= 0) { } @@ -314,7 +314,7 @@ public double getCompressionRatio(TestData testData) int compressedSize; try { ByteArrayOutputStream rawOut = new ByteArrayOutputStream(Snappy.maxCompressedLength(contents.length)); - SnappyOutputStream out = SnappyOutputStream.newChecksumFreeBenchmarkOutputStream(rawOut); + SnappyFramedOutputStream out = SnappyFramedOutputStream.newChecksumFreeBenchmarkOutputStream(rawOut); out.write(contents); out.close(); diff --git a/src/test/java/org/iq80/snappy/SnappyBench.java b/src/test/java/org/iq80/snappy/SnappyBench.java index 8674c56..46aa22a 100644 --- a/src/test/java/org/iq80/snappy/SnappyBench.java +++ b/src/test/java/org/iq80/snappy/SnappyBench.java @@ -116,11 +116,11 @@ public void verify() byte[] contents = testData.getContents(); ByteArrayOutputStream rawOut = new ByteArrayOutputStream(Snappy.maxCompressedLength(contents.length)); - SnappyOutputStream out = new SnappyOutputStream(rawOut); + 
SnappyFramedOutputStream out = new SnappyFramedOutputStream(rawOut); out.write(contents); out.close(); - SnappyInputStream in = new SnappyInputStream(new ByteArrayInputStream(rawOut.toByteArray())); + SnappyFramedInputStream in = new SnappyFramedInputStream(new ByteArrayInputStream(rawOut.toByteArray())); byte[] uncompressed = ByteStreams.toByteArray(in); if (!Arrays.equals(uncompressed, testData.getContents())) { diff --git a/src/test/java/org/iq80/snappy/SnappyFramedStreamTest.java b/src/test/java/org/iq80/snappy/SnappyFramedStreamTest.java index 67e4e2e..5a2e004 100644 --- a/src/test/java/org/iq80/snappy/SnappyFramedStreamTest.java +++ b/src/test/java/org/iq80/snappy/SnappyFramedStreamTest.java @@ -18,8 +18,6 @@ package org.iq80.snappy; import com.google.common.base.Charsets; -import org.testng.annotations.AfterTest; -import org.testng.annotations.BeforeTest; import org.testng.annotations.Test; import java.io.ByteArrayInputStream; @@ -44,13 +42,6 @@ public class SnappyFramedStreamTest extends AbstractSnappyStreamTest { - @BeforeTest - @AfterTest - public void resetBufferRecycler() - { - BufferRecycler.instance().clear(); - } - @Override protected OutputStream createOutputStream(OutputStream target) throws IOException diff --git a/src/test/java/org/iq80/snappy/SnappyStreamTest.java b/src/test/java/org/iq80/snappy/SnappyStreamTest.java deleted file mode 100644 index 4f26f6d..0000000 --- a/src/test/java/org/iq80/snappy/SnappyStreamTest.java +++ /dev/null @@ -1,171 +0,0 @@ -/* - * Copyright (C) 2011 the original author or authors. - * See the notice.md file distributed with this work for additional - * information regarding copyright ownership. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.iq80.snappy; - -import com.google.common.base.Charsets; -import org.testng.annotations.Test; - -import java.io.ByteArrayInputStream; -import java.io.EOFException; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.nio.ByteBuffer; -import java.util.Arrays; - -import static com.google.common.io.ByteStreams.toByteArray; -import static com.google.common.primitives.UnsignedBytes.toInt; -import static org.iq80.snappy.SnappyOutputStream.STREAM_HEADER; -import static org.testng.Assert.assertEquals; - -public class SnappyStreamTest - extends AbstractSnappyStreamTest -{ - @Override - protected OutputStream createOutputStream(OutputStream target) - throws IOException - { - return new SnappyOutputStream(target); - } - - @Override - protected InputStream createInputStream(InputStream source, boolean verifyCheckSums) - throws IOException - { - return new SnappyInputStream(source, verifyCheckSums); - } - - @Override - protected byte[] getMarkerFrame() - { - return STREAM_HEADER; - } - - @Test - public void testSimple() - throws Exception - { - byte[] original = "aaaaaaaaaaaabbbbbbbaaaaaa".getBytes(Charsets.UTF_8); - - byte[] compressed = compress(original); - byte[] uncompressed = uncompress(compressed); - - assertEquals(uncompressed, original); - assertEquals(compressed.length, 33); // 7 byte stream header, 7 byte block header, 19 bytes compressed data - assertEquals(Arrays.copyOf(compressed, 7), STREAM_HEADER); // stream header - assertEquals(toInt(compressed[7]), 0x01); // flag: compressed - 
assertEquals(toInt(compressed[8]), 0x00); // length: 19 = 0x0013 - assertEquals(toInt(compressed[9]), 0x13); - assertEquals(toInt(compressed[10]), 0x92); // crc32c: 0x9274cda8 - assertEquals(toInt(compressed[11]), 0x74); - assertEquals(toInt(compressed[12]), 0xCD); - assertEquals(toInt(compressed[13]), 0xA8); - } - - @Test - public void testUncompressable() - throws Exception - { - byte[] random = getRandom(1, 5000); - int crc32c = Crc32C.maskedCrc32c(random); - - byte[] compressed = compress(random); - byte[] uncompressed = uncompress(compressed); - - assertEquals(uncompressed, random); - assertEquals(compressed.length, random.length + 7 + 7); - assertEquals(toInt(compressed[7]), 0x00); // flag: uncompressed - assertEquals(toInt(compressed[8]), 0x13); // length: 5000 = 0x1388 - assertEquals(toInt(compressed[9]), 0x88); - assertEquals(ByteBuffer.wrap(compressed, 10, 4).getInt(), crc32c); // crc: see above - } - - @Test - public void testEmptyCompression() - throws Exception - { - byte[] empty = new byte[0]; - assertEquals(compress(empty), STREAM_HEADER); - assertEquals(uncompress(STREAM_HEADER), empty); - } - - @Test(expectedExceptions = EOFException.class, expectedExceptionsMessageRegExp = ".*block header.*") - public void testShortBlockHeader() - throws Exception - { - uncompressBlock(new byte[] {0}); - } - - @Test(expectedExceptions = EOFException.class, expectedExceptionsMessageRegExp = ".*reading frame.*") - public void testShortBlockData() - throws Exception - { - uncompressBlock(new byte[] {0, 0, 4, 0, 0, 0, 0, 'x', 'x'}); // flag = 0, size = 4, crc32c = 0, block data = [x, x] - } - - @Test(expectedExceptions = IOException.class, expectedExceptionsMessageRegExp = "invalid compressed flag in header: 0x41") - public void testInvalidBlockHeaderCompressedFlag() - throws Exception - { - uncompressBlock(new byte[] {'A', 0, 1, 0, 0, 0, 0, 0}); // flag = 'A', block size = 1, crc32c = 0 - } - - @Test(expectedExceptions = IOException.class, 
expectedExceptionsMessageRegExp = "invalid block size in header: 0") - public void testInvalidBlockSizeZero() - throws Exception - { - uncompressBlock(new byte[] {0, 0, 0, 0, 0, 0, 0}); // flag = '0', block size = 0, crc32c = 0 - } - - @Test(expectedExceptions = IOException.class, expectedExceptionsMessageRegExp = "invalid block size in header: 55555") - public void testInvalidBlockSizeLarge() - throws Exception - { - uncompressBlock(new byte[] {0, (byte) 0xD9, 0x03, 0, 0, 0, 0}); // flag = 0, block size = 55555, crc32c = 0 - } - - @Test(expectedExceptions = IOException.class, expectedExceptionsMessageRegExp = "Corrupt input: invalid checksum") - public void testInvalidChecksum() - throws Exception - { - uncompressBlock(new byte[] {0, 0, 1, 0, 0, 0, 0, 'a'}); // flag = 0, size = 4, crc32c = 0, block data = [a] - } - - @Test - public void testInvalidChecksumIgnoredWhenVerificationDisabled() - throws Exception - { - byte[] block = {0, 0, 1, 0, 0, 0, 0, 'a'}; // flag = 0, size = 4, crc32c = 0, block data = [a] - ByteArrayInputStream inputData = new ByteArrayInputStream(blockToStream(block)); - assertEquals(toByteArray(createInputStream(inputData, false)), new byte[] {'a'}); - } - - private byte[] uncompressBlock(byte[] block) - throws IOException - { - return uncompress(blockToStream(block)); - } - - private static byte[] blockToStream(byte[] block) - { - byte[] stream = new byte[STREAM_HEADER.length + block.length]; - System.arraycopy(STREAM_HEADER, 0, stream, 0, STREAM_HEADER.length); - System.arraycopy(block, 0, stream, STREAM_HEADER.length, block.length); - return stream; - } -} diff --git a/src/test/java/org/iq80/snappy/SnappyTest.java b/src/test/java/org/iq80/snappy/SnappyTest.java index c8a49aa..2219240 100644 --- a/src/test/java/org/iq80/snappy/SnappyTest.java +++ b/src/test/java/org/iq80/snappy/SnappyTest.java @@ -22,16 +22,15 @@ import org.testng.annotations.Test; import java.io.File; -import java.util.Arrays; import java.util.Random; public class SnappyTest 
{ private static final File TEST_DATA_DIR = new File("testdata"); - private RandomGenerator randomGenerator = new RandomGenerator(0.5); + private final RandomGenerator randomGenerator = new RandomGenerator(0.5); @Test - public void testByteForByteOutputSyntheticData() + public void testNativeCompatibleSyntheticData() throws Exception { for (int i = 1; i < 65 * 1024; i++) { @@ -39,13 +38,14 @@ public void testByteForByteOutputSyntheticData() verifyCompression(i); } catch (Error e) { + e.printStackTrace(); Assert.fail(i + " byte block", e); } } } @Test - public void testByteForByteTestData() + public void testNativeCompatibleTestData() throws Exception { for (File testFile : getTestFiles()) { @@ -54,12 +54,36 @@ public void testByteForByteTestData() verifyCompression(data, 0, data.length); } catch (Throwable e) { + e.printStackTrace(); Assert.fail("Testdata: " + testFile.getName(), e); - } } } + @Test(expectedExceptions = CorruptionException.class, expectedExceptionsMessageRegExp = "Malformed input: offset=5") + public void testInvalidLiteralLength() + { + byte[] data = { + // Encoded uncompressed length 1024 + -128, 8, + // op-code + (byte) 252, + // Trailer value Integer.MAX_VALUE + (byte) 0b1111_1111, (byte) 0b1111_1111, (byte) 0b1111_1111, (byte) 0b0111_1111, + // Some arbitrary data + 0, 0, 0, 0, 0, 0, 0, 0 + }; + + Snappy.uncompress(data, 0, data.length, new byte[1024], 0, 1024); + } + + @Test(expectedExceptions = CorruptionException.class, expectedExceptionsMessageRegExp = "negative compressed length: offset=16") + public void testNegativeLength() + { + byte[] data = {(byte) 255, (byte) 255, (byte) 255, (byte) 255, 0b0000_1000}; + Snappy.getUncompressedLength(data, 0); + } + private void verifyCompression(int size) throws Exception { @@ -72,16 +96,8 @@ private void verifyCompression(int size) private static void verifyCompression(byte[] input, int position, int size) throws Exception { - byte[] nativeCompressed = new 
byte[org.xerial.snappy.Snappy.maxCompressedLength(size)]; byte[] javaCompressed = new byte[Snappy.maxCompressedLength(size)]; - int nativeCompressedSize = org.xerial.snappy.Snappy.compress( - input, - position, - size, - nativeCompressed, - 0); - int javaCompressedSize = Snappy.compress( input, position, @@ -89,29 +105,34 @@ private static void verifyCompression(byte[] input, int position, int size) javaCompressed, 0); - // verify outputs are exactly the same - String failureMessage = "Invalid compressed output for input size " + size + " at offset " + position; - if (!SnappyInternalUtils.equals(javaCompressed, 0, nativeCompressed, 0, nativeCompressedSize)) { - if (nativeCompressedSize < 100) { - Assert.assertEquals( - Arrays.toString(Arrays.copyOf(javaCompressed, nativeCompressedSize)), - Arrays.toString(Arrays.copyOf(nativeCompressed, nativeCompressedSize)), - failureMessage - ); - } - else { - Assert.fail(failureMessage); - } - } - Assert.assertEquals(javaCompressedSize, nativeCompressedSize); - - // verify the contents can be uncompressed + // Verify Java codec decompresses Java compressed data byte[] uncompressed = new byte[size]; - Snappy.uncompress(javaCompressed, 0, javaCompressedSize, uncompressed, 0); + int uncompressedSize = Snappy.uncompress(javaCompressed, 0, javaCompressedSize, uncompressed, 0); + Assert.assertEquals(uncompressedSize, size, "Size mismatch"); + Assert.assertTrue(arraysEqual(input, position, uncompressed, 0, size), "Data mismatch"); - if (!SnappyInternalUtils.equals(uncompressed, 0, input, position, size)) { - Assert.fail("Invalid uncompressed output for input size " + size + " at offset " + position); - } + // Verify Native codec decompresses Java compressed data + byte[] nativeUncompressed = new byte[size]; + int nativeUncompressedSize = org.xerial.snappy.Snappy.uncompress( + javaCompressed, + 0, + javaCompressedSize, + nativeUncompressed, + 0); + Assert.assertEquals(nativeUncompressedSize, size, "Size mismatch"); + 
Assert.assertTrue(arraysEqual(input, position, nativeUncompressed, 0, size), "Data mismatch"); + + // Verify Java codec decompresses Native compressed data + byte[] nativeCompressed = new byte[org.xerial.snappy.Snappy.maxCompressedLength(size)]; + int nativeCompressedSize = org.xerial.snappy.Snappy.compress( + input, + position, + size, + nativeCompressed, + 0); + uncompressedSize = Snappy.uncompress(nativeCompressed, 0, nativeCompressedSize, uncompressed, 0); + Assert.assertEquals(uncompressedSize, size, "Size mismatch"); + Assert.assertTrue(arraysEqual(input, position, uncompressed, 0, size), "Data mismatch"); } public static class RandomGenerator @@ -177,4 +198,14 @@ static File[] getTestFiles() Assert.assertTrue(testFiles != null && testFiles.length > 0, "No test files at " + TEST_DATA_DIR.getAbsolutePath()); return testFiles; } + + private static boolean arraysEqual(byte[] left, int leftIndex, byte[] right, int rightIndex, int length) + { + for (int i = 0; i < length; i++) { + if (left[leftIndex + i] != right[rightIndex + i]) { + return false; + } + } + return true; + } }