LZWCodec.java example

Explorer
bioformats-master
- components
- docs
  - sphinx
    - developers
/*
 * #%L
 * BSD implementations of Bio-Formats readers and writers
 * %%
 * Copyright (C) 2005 - 2015 Open Microscopy Environment:
 *   - Board of Regents of the University of Wisconsin-Madison
 *   - Glencoe Software, Inc.
 *   - University of Dundee
 * %%
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 * 
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 * 
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 * #L%
 */

package loci.formats.codec;

import java.io.EOFException;
import java.io.IOException;
import java.util.Arrays;

import loci.common.RandomAccessInputStream;
import loci.formats.FormatException;

/**
 * This is an optimized LZW codec for use with TIFF files.
 * Most of the code is inlined, and specifics of TIFF usage of LZW
 * (known size of decompressor output; possible lengths of LZW codes; specified
 * values for <code>CLEAR</code> and <code>END_OF_INFORMATION</code> codes)
 * are taken in account.
 * <p>
 * Estimating the worst-case size of compressor output:
 * <ul>
 * <li> The worst case means that there is no compression at all, and every
 *      input byte generates code to output.
 * <li> This means that the LZW table will be full (and reset) after reading
 *      each portion of 4096-256-2-1=3837 bytes of input
 *      (first 256 codes are preallocated, 2 codes are used for CLEAR and
 *      END_OF_IFORMATION, 1 code is lost due to original bug in TIFF library
 *      that now is a feature).
 * <li> Each full portion of 3837 byte will produce in output:
 * <ul>
 * <li> 9 bits for CLEAR code;
 * <li> 9*253 + 10*512 + 11*1024 + 12*2048 = 43237 bits for character codes.
 * </ul>
 * <li> Let n=3837, m=(number of bytes in the last incomplete portion),
 *      N=(number of bytes in compressed complete portion with CLEAR code),
 *      M=(number of bytes in compressed last incomplete portion).
 *      We have inequalities:
 * <ul>
 * <li> N <= 1.41 * n
 * <li> M <= 1.41 * m
 * <li> The last incomplete portion should also include CLEAR and
 *      END_OF_INFORMATION codes; they occupy less than 3 bytes.
 * </ul>
 * Thus, we can claim than the number of bytes in compressed output never
 * exceeds 1.41*(number of input bytes)+3.
 * <p>
 *
 * @author Mikhail Kovtun mikhail.kovtun at duke.edu
 */
public class LZWCodec extends BaseCodec {

  /**
   * Size of hash table. Must be greater 3837 (the number of possible codes).
   * Bigger size reduces number of rehashing steps --
   * at expence of initialization time.
   */
  private static final int HASH_SIZE = 7349;

  /** Rehashing step. HASH_SIZE and HASH_STEP shoulg be coprime. */
  private static final int HASH_STEP = 257;

  private static final int CLEAR_CODE = 256;
  private static final int EOI_CODE = 257;
  private static final int FIRST_CODE = 258;

  /** Masks for writing bits in compressor. */
  private static final int[] COMPR_MASKS =
    {0xff, 0x7f, 0x3f, 0x1f, 0x0f, 0x07, 0x03, 0x01};

  /** Masks for reading bits in decompressor. */
  private static final int[] DECOMPR_MASKS =
    {0x00, 0x01, 0x03, 0x07, 0x0f, 0x1f, 0x3f, 0x7f};

  /* @see Codec#compress(byte[], CodecOptions) */
  @Override
  public byte[] compress(byte[] input, CodecOptions options)
    throws FormatException
  {
    if (input == null || input.length == 0) return input;

    // Output buffer (see class comments for justification of size).
    long bufferSize = ((long) input.length * 141) / 100 + 3;
    if (bufferSize > Integer.MAX_VALUE) {
      throw new FormatException("Output buffer is greater than 2 GB");
    }
    byte[] output = new byte[(int) bufferSize];

    // Current size of output buffer (and position to write next byte).
    int outSize = 0;
    // The output always starts with CLEAR code
    output[outSize++] = (byte) (CLEAR_CODE >> 1);
    // Last incomplete byte to be written to output (bits shifted to the right).
    // Always contains at least 1 bit, and may contain 8 bits.
    int currOutByte = CLEAR_CODE & 0x01;
    // Number of unused bits in currOutByte (from 0 to 7).
    int freeBits = 7;

    // Hash table.
    // Keys in the table are pairs (code,byte) and values are codes.
    // Pair (code,byte) is represented as ( (code<<8) | byte ).
    // Unused table entries have key=-1.
    int[] htKeys   = new int[HASH_SIZE];
    int[] htValues = new int[HASH_SIZE];
    // Initialize hash table: mark all entries as unused
    Arrays.fill(htKeys, -1);

    // Next code to be used by compressor.
    int nextCode = FIRST_CODE;
    // Number of bits to be used to output code. Ranges from 9 to 12.
    int currCodeLength = 9;

    // Names of these variables are taken from TIFF specification.
    // The first byte of input is handled specially.
    int tiffK = input[0] & 0xff;
    int tiffOmega = tiffK;

    // Main loop.
    for (int currInPos=1; currInPos<input.length; currInPos++) {
      tiffK = input[currInPos] & 0xff;
      int hashKey = (tiffOmega << 8) | tiffK;
      int hashCode = hashKey % HASH_SIZE;
      do {
        if (htKeys[hashCode] == hashKey) {
          // Omega+K in the table
          tiffOmega = htValues[hashCode];
          break;
        }
        else if (htKeys[hashCode] < 0) {
          // Omega+K not in the table
          // 1) add new entry to hash table
          htKeys[hashCode] = hashKey;
          htValues[hashCode] = nextCode++;
          // 2) output last code
          int shift = currCodeLength - freeBits;
          output[outSize++] =
            (byte) ((currOutByte << freeBits) | (tiffOmega >> shift));
          if (shift > 8) {
            output[outSize++] = (byte) (tiffOmega >> (shift - 8));
            shift -= 8;
          }
          freeBits = 8 - shift;
          currOutByte = tiffOmega & COMPR_MASKS[freeBits];
          // 3) omega = K
          tiffOmega = tiffK;
          break;
        }
        else {
          // we have to rehash
          hashCode = (hashCode + HASH_STEP) % HASH_SIZE;
        };
      } while (true);

      switch (nextCode) {
        case 512:
          currCodeLength = 10;
          break;
        case 1024:
          currCodeLength = 11;
          break;
        case 2048:
          currCodeLength = 12;
          break;
        case 4096:  // write CLEAR code and reinitialize hash table
         int shift = currCodeLength - freeBits;
         output[outSize++] =
           (byte) ((currOutByte << freeBits) | (CLEAR_CODE >> shift));
         if (shift > 8) {
           output[outSize++] = (byte) (CLEAR_CODE >> (shift - 8));
           shift -= 8;
         }
         freeBits = 8 - shift;
         currOutByte = CLEAR_CODE & COMPR_MASKS[freeBits];
         Arrays.fill(htKeys, -1);
         nextCode = FIRST_CODE;
         currCodeLength = 9;
         break;
      }
    }

    // End of input:
    // 1) write code from tiff_Omega
    {
      int shift = currCodeLength - freeBits;
      output[outSize++] =
        (byte) ((currOutByte << freeBits) | (tiffOmega >> shift));
      if (shift > 8) {
        output[outSize++] = (byte) (tiffOmega >> (shift - 8));
        shift -= 8;
      }
      freeBits = 8 - shift;
      currOutByte = tiffOmega & COMPR_MASKS[freeBits];
    }
    // 2) write END_OF_INFORMATION code
    //    -- we write the last incomplete byte here as well
    // !!! We have to increase length of code if needed !!!
    switch (nextCode) {
      case 511:
        currCodeLength = 10;
        break;
      case 1023:
        currCodeLength = 11;
        break;
      case 2047:
        currCodeLength = 12;
        break;
    }

    {
      int shift = currCodeLength - freeBits;
      output[outSize++] =
        (byte) ((currOutByte << freeBits) | (EOI_CODE >> shift));
      if (shift > 8) {
        output[outSize++] = (byte) (EOI_CODE >> (shift - 8));
        shift -= 8;
      }
      freeBits = 8 - shift;
      currOutByte = EOI_CODE & COMPR_MASKS[freeBits];
      output[outSize++] = (byte) (currOutByte << freeBits);
    }

    byte[] result = new byte[outSize];
    System.arraycopy(output, 0, result, 0, outSize);
    return result;
  }

  /**
   * The CodecOptions parameter should have the following fields set:
   *  {@link CodecOptions#maxBytes maxBytes}
   *
   * @see Codec#decompress(RandomAccessInputStream, CodecOptions)
   */
  @Override
  public byte[] decompress(RandomAccessInputStream in, CodecOptions options)
    throws FormatException, IOException
  {
    if (in == null || in.length() == 0) return null;
    if (options == null) options = CodecOptions.getDefaultOptions();

    // Output buffer
    byte[] output = new byte[options.maxBytes];
    // Position in output buffer to write next byte to
    int currOutPos = 0;

    // Table mapping codes to strings.
    // Its structure is based on the fact that a string for a code has form:
    // (string for another code) + (new byte).
    // Thus, at index 'code': first array contains 'another code', second array
    // contains 'new byte', and third array contains length of the string.
    // The length is needed to make retrieving the string faster.
    int[] anotherCodes = new int[4096];
    byte[] newBytes = new byte[4096];
    int[] lengths = new int[4096];
    // We need to initialize only firt 256 entries in the table
    for (int i=0; i<256; i++) {
      newBytes[i] = (byte) i;
      lengths[i] = 1;
    }

    // Length of the code to be read from input
    int currCodeLength = 9;
    // Next code to be added to the table
    int nextCode = FIRST_CODE;

    // Variables to handle reading bit stream:
    // Byte from 'input[curr_in_pos-1]' -- only 'bits_read' bits on the right
    // are non-zero
    int currRead = 0;
    // Number of bits in 'curr_read' that were not consumed yet
    int bitsRead = 0;

    // Current code being processed by decompressor.
    int currCode;
    // Previous code processed by decompressor.
    int oldCode = 0;   // without initializer, Java reports error later

    try {
      do {
        // read next code
        {
          int bitsLeft = currCodeLength - bitsRead;
          if (bitsLeft > 8) {
            currRead = (currRead << 8) | (in.read() & 0xff);
            bitsLeft -= 8;
          }
          bitsRead = 8 - bitsLeft;
          int nextByte = in.read() & 0xff;
          currCode = (currRead << bitsLeft) | (nextByte >> bitsRead);
          currRead = nextByte & DECOMPR_MASKS[bitsRead];
        }

        if (currCode == EOI_CODE) break;

        if (currCode == CLEAR_CODE) {
          // initialize table -- nothing to do
          nextCode = FIRST_CODE;
          currCodeLength = 9;
          // read next code
          {
            int bitsLeft = currCodeLength - bitsRead;
            if (bitsLeft > 8) {
              currRead = (currRead << 8) | (in.read() & 0xff);
              bitsLeft -= 8;
            }
            bitsRead = 8 - bitsLeft;

            int nextByte = in.read() & 0xff;
            currCode = (currRead << bitsLeft) | (nextByte >> bitsRead);
            currRead = nextByte & DECOMPR_MASKS[bitsRead];
          }
          if (currCode == EOI_CODE) break;
            // write string[curr_code] to output
            // -- but here we are sure that string consists of a single byte
            if (currOutPos >= output.length) break;
            output[currOutPos++] = newBytes[currCode];
            oldCode = currCode;
        }
        else if (currCode < nextCode) {
          // Code is already in the table
          // 1) Write strin[curr_code] to output
          int outLength = lengths[currCode];
          int i = currOutPos + outLength;
          int tablePos = currCode;
          while (i > output.length) {
            tablePos = anotherCodes[tablePos];
            i--;
          }
          while (i > currOutPos) {
            output[--i] = newBytes[tablePos];
            tablePos = anotherCodes[tablePos];
          }
          if (i >= output.length) break;
          currOutPos += outLength;
          // 2) Add string[old_code]+firstByte(string[curr_code]) to the table
          if (nextCode >= anotherCodes.length) break; 
          anotherCodes[nextCode] = oldCode;
          newBytes[nextCode] = output[i];
          lengths[nextCode] = lengths[oldCode] + 1;
          oldCode = currCode;
          nextCode++;
        }
        else {
          // Special case: code is not in the table
          // 1) Write string[old_code] to output
          int outLength = lengths[oldCode];
          int i = currOutPos + outLength;
          int tablePos = oldCode;
          if (i > output.length) break;
          while (i > currOutPos) {
            output[--i] = newBytes[tablePos];
            tablePos = anotherCodes[tablePos];
          }
          currOutPos += outLength;
          // 2) Write firstByte(string[old_code]) to output
          if (currOutPos > output.length - 1) break;
          output[currOutPos++] = output[i];
          // 3) Add string[old_code]+firstByte(string[old_code]) to the table
          anotherCodes[nextCode] = oldCode;
          newBytes[nextCode] = output[i];
          lengths[nextCode] = outLength + 1;
          oldCode = currCode;
          nextCode++;
        }
        // Increase length of code if needed
        switch (nextCode) {
          case 511:
            currCodeLength = 10;
            break;
          case 1023:
            currCodeLength = 11;
            break;
          case 2047:
            currCodeLength = 12;
            break;
        }
      } while (currOutPos < output.length && in.getFilePointer() < in.length());
    }
    catch (ArrayIndexOutOfBoundsException e) {
      throw new FormatException("Invalid LZW data", e);
    }
    catch (EOFException e) { }
    return output;
  }
}