/* * #%L * BSD implementations of Bio-Formats readers and writers * %% * Copyright (C) 2005 - 2015 Open Microscopy Environment: * - Board of Regents of the University of Wisconsin-Madison * - Glencoe Software, Inc. * - University of Dundee * %% * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * #L% */ package loci.formats.codec; import java.io.EOFException; import java.io.IOException; import java.util.Arrays; import loci.common.RandomAccessInputStream; import loci.formats.FormatException; /** * This is an optimized LZW codec for use with TIFF files. * Most of the code is inlined, and specifics of TIFF usage of LZW * (known size of decompressor output; possible lengths of LZW codes; specified * values for <code>CLEAR</code> and <code>END_OF_INFORMATION</code> codes) * are taken in account. * <p> * Estimating the worst-case size of compressor output: * <ul> * <li> The worst case means that there is no compression at all, and every * input byte generates code to output. * <li> This means that the LZW table will be full (and reset) after reading * each portion of 4096-256-2-1=3837 bytes of input * (first 256 codes are preallocated, 2 codes are used for CLEAR and * END_OF_IFORMATION, 1 code is lost due to original bug in TIFF library * that now is a feature). * <li> Each full portion of 3837 byte will produce in output: * <ul> * <li> 9 bits for CLEAR code; * <li> 9*253 + 10*512 + 11*1024 + 12*2048 = 43237 bits for character codes. * </ul> * <li> Let n=3837, m=(number of bytes in the last incomplete portion), * N=(number of bytes in compressed complete portion with CLEAR code), * M=(number of bytes in compressed last incomplete portion). * We have inequalities: * <ul> * <li> N <= 1.41 * n * <li> M <= 1.41 * m * <li> The last incomplete portion should also include CLEAR and * END_OF_INFORMATION codes; they occupy less than 3 bytes. * </ul> * Thus, we can claim than the number of bytes in compressed output never * exceeds 1.41*(number of input bytes)+3. * <p> * * @author Mikhail Kovtun mikhail.kovtun at duke.edu */ public class LZWCodec extends BaseCodec { /** * Size of hash table. Must be greater 3837 (the number of possible codes). * Bigger size reduces number of rehashing steps -- * at expence of initialization time. */ private static final int HASH_SIZE = 7349; /** Rehashing step. HASH_SIZE and HASH_STEP shoulg be coprime. */ private static final int HASH_STEP = 257; private static final int CLEAR_CODE = 256; private static final int EOI_CODE = 257; private static final int FIRST_CODE = 258; /** Masks for writing bits in compressor. */ private static final int[] COMPR_MASKS = {0xff, 0x7f, 0x3f, 0x1f, 0x0f, 0x07, 0x03, 0x01}; /** Masks for reading bits in decompressor. */ private static final int[] DECOMPR_MASKS = {0x00, 0x01, 0x03, 0x07, 0x0f, 0x1f, 0x3f, 0x7f}; /* @see Codec#compress(byte[], CodecOptions) */ @Override public byte[] compress(byte[] input, CodecOptions options) throws FormatException { if (input == null || input.length == 0) return input; // Output buffer (see class comments for justification of size). long bufferSize = ((long) input.length * 141) / 100 + 3; if (bufferSize > Integer.MAX_VALUE) { throw new FormatException("Output buffer is greater than 2 GB"); } byte[] output = new byte[(int) bufferSize]; // Current size of output buffer (and position to write next byte). int outSize = 0; // The output always starts with CLEAR code output[outSize++] = (byte) (CLEAR_CODE >> 1); // Last incomplete byte to be written to output (bits shifted to the right). // Always contains at least 1 bit, and may contain 8 bits. int currOutByte = CLEAR_CODE & 0x01; // Number of unused bits in currOutByte (from 0 to 7). int freeBits = 7; // Hash table. // Keys in the table are pairs (code,byte) and values are codes. // Pair (code,byte) is represented as ( (code<<8) | byte ). // Unused table entries have key=-1. int[] htKeys = new int[HASH_SIZE]; int[] htValues = new int[HASH_SIZE]; // Initialize hash table: mark all entries as unused Arrays.fill(htKeys, -1); // Next code to be used by compressor. int nextCode = FIRST_CODE; // Number of bits to be used to output code. Ranges from 9 to 12. int currCodeLength = 9; // Names of these variables are taken from TIFF specification. // The first byte of input is handled specially. int tiffK = input[0] & 0xff; int tiffOmega = tiffK; // Main loop. for (int currInPos=1; currInPos<input.length; currInPos++) { tiffK = input[currInPos] & 0xff; int hashKey = (tiffOmega << 8) | tiffK; int hashCode = hashKey % HASH_SIZE; do { if (htKeys[hashCode] == hashKey) { // Omega+K in the table tiffOmega = htValues[hashCode]; break; } else if (htKeys[hashCode] < 0) { // Omega+K not in the table // 1) add new entry to hash table htKeys[hashCode] = hashKey; htValues[hashCode] = nextCode++; // 2) output last code int shift = currCodeLength - freeBits; output[outSize++] = (byte) ((currOutByte << freeBits) | (tiffOmega >> shift)); if (shift > 8) { output[outSize++] = (byte) (tiffOmega >> (shift - 8)); shift -= 8; } freeBits = 8 - shift; currOutByte = tiffOmega & COMPR_MASKS[freeBits]; // 3) omega = K tiffOmega = tiffK; break; } else { // we have to rehash hashCode = (hashCode + HASH_STEP) % HASH_SIZE; }; } while (true); switch (nextCode) { case 512: currCodeLength = 10; break; case 1024: currCodeLength = 11; break; case 2048: currCodeLength = 12; break; case 4096: // write CLEAR code and reinitialize hash table int shift = currCodeLength - freeBits; output[outSize++] = (byte) ((currOutByte << freeBits) | (CLEAR_CODE >> shift)); if (shift > 8) { output[outSize++] = (byte) (CLEAR_CODE >> (shift - 8)); shift -= 8; } freeBits = 8 - shift; currOutByte = CLEAR_CODE & COMPR_MASKS[freeBits]; Arrays.fill(htKeys, -1); nextCode = FIRST_CODE; currCodeLength = 9; break; } } // End of input: // 1) write code from tiff_Omega { int shift = currCodeLength - freeBits; output[outSize++] = (byte) ((currOutByte << freeBits) | (tiffOmega >> shift)); if (shift > 8) { output[outSize++] = (byte) (tiffOmega >> (shift - 8)); shift -= 8; } freeBits = 8 - shift; currOutByte = tiffOmega & COMPR_MASKS[freeBits]; } // 2) write END_OF_INFORMATION code // -- we write the last incomplete byte here as well // !!! We have to increase length of code if needed !!! switch (nextCode) { case 511: currCodeLength = 10; break; case 1023: currCodeLength = 11; break; case 2047: currCodeLength = 12; break; } { int shift = currCodeLength - freeBits; output[outSize++] = (byte) ((currOutByte << freeBits) | (EOI_CODE >> shift)); if (shift > 8) { output[outSize++] = (byte) (EOI_CODE >> (shift - 8)); shift -= 8; } freeBits = 8 - shift; currOutByte = EOI_CODE & COMPR_MASKS[freeBits]; output[outSize++] = (byte) (currOutByte << freeBits); } byte[] result = new byte[outSize]; System.arraycopy(output, 0, result, 0, outSize); return result; } /** * The CodecOptions parameter should have the following fields set: * {@link CodecOptions#maxBytes maxBytes} * * @see Codec#decompress(RandomAccessInputStream, CodecOptions) */ @Override public byte[] decompress(RandomAccessInputStream in, CodecOptions options) throws FormatException, IOException { if (in == null || in.length() == 0) return null; if (options == null) options = CodecOptions.getDefaultOptions(); // Output buffer byte[] output = new byte[options.maxBytes]; // Position in output buffer to write next byte to int currOutPos = 0; // Table mapping codes to strings. // Its structure is based on the fact that a string for a code has form: // (string for another code) + (new byte). // Thus, at index 'code': first array contains 'another code', second array // contains 'new byte', and third array contains length of the string. // The length is needed to make retrieving the string faster. int[] anotherCodes = new int[4096]; byte[] newBytes = new byte[4096]; int[] lengths = new int[4096]; // We need to initialize only firt 256 entries in the table for (int i=0; i<256; i++) { newBytes[i] = (byte) i; lengths[i] = 1; } // Length of the code to be read from input int currCodeLength = 9; // Next code to be added to the table int nextCode = FIRST_CODE; // Variables to handle reading bit stream: // Byte from 'input[curr_in_pos-1]' -- only 'bits_read' bits on the right // are non-zero int currRead = 0; // Number of bits in 'curr_read' that were not consumed yet int bitsRead = 0; // Current code being processed by decompressor. int currCode; // Previous code processed by decompressor. int oldCode = 0; // without initializer, Java reports error later try { do { // read next code { int bitsLeft = currCodeLength - bitsRead; if (bitsLeft > 8) { currRead = (currRead << 8) | (in.read() & 0xff); bitsLeft -= 8; } bitsRead = 8 - bitsLeft; int nextByte = in.read() & 0xff; currCode = (currRead << bitsLeft) | (nextByte >> bitsRead); currRead = nextByte & DECOMPR_MASKS[bitsRead]; } if (currCode == EOI_CODE) break; if (currCode == CLEAR_CODE) { // initialize table -- nothing to do nextCode = FIRST_CODE; currCodeLength = 9; // read next code { int bitsLeft = currCodeLength - bitsRead; if (bitsLeft > 8) { currRead = (currRead << 8) | (in.read() & 0xff); bitsLeft -= 8; } bitsRead = 8 - bitsLeft; int nextByte = in.read() & 0xff; currCode = (currRead << bitsLeft) | (nextByte >> bitsRead); currRead = nextByte & DECOMPR_MASKS[bitsRead]; } if (currCode == EOI_CODE) break; // write string[curr_code] to output // -- but here we are sure that string consists of a single byte if (currOutPos >= output.length) break; output[currOutPos++] = newBytes[currCode]; oldCode = currCode; } else if (currCode < nextCode) { // Code is already in the table // 1) Write strin[curr_code] to output int outLength = lengths[currCode]; int i = currOutPos + outLength; int tablePos = currCode; while (i > output.length) { tablePos = anotherCodes[tablePos]; i--; } while (i > currOutPos) { output[--i] = newBytes[tablePos]; tablePos = anotherCodes[tablePos]; } if (i >= output.length) break; currOutPos += outLength; // 2) Add string[old_code]+firstByte(string[curr_code]) to the table if (nextCode >= anotherCodes.length) break; anotherCodes[nextCode] = oldCode; newBytes[nextCode] = output[i]; lengths[nextCode] = lengths[oldCode] + 1; oldCode = currCode; nextCode++; } else { // Special case: code is not in the table // 1) Write string[old_code] to output int outLength = lengths[oldCode]; int i = currOutPos + outLength; int tablePos = oldCode; if (i > output.length) break; while (i > currOutPos) { output[--i] = newBytes[tablePos]; tablePos = anotherCodes[tablePos]; } currOutPos += outLength; // 2) Write firstByte(string[old_code]) to output if (currOutPos > output.length - 1) break; output[currOutPos++] = output[i]; // 3) Add string[old_code]+firstByte(string[old_code]) to the table anotherCodes[nextCode] = oldCode; newBytes[nextCode] = output[i]; lengths[nextCode] = outLength + 1; oldCode = currCode; nextCode++; } // Increase length of code if needed switch (nextCode) { case 511: currCodeLength = 10; break; case 1023: currCodeLength = 11; break; case 2047: currCodeLength = 12; break; } } while (currOutPos < output.length && in.getFilePointer() < in.length()); } catch (ArrayIndexOutOfBoundsException e) { throw new FormatException("Invalid LZW data", e); } catch (EOFException e) { } return output; } }