CharsetValidation.java example

Explorer
sonarqube-master
/*
 * SonarQube
 * Copyright (C) 2009-2017 SonarSource SA
 * mailto:info AT sonarsource DOT com
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 3 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 */
package org.sonar.scanner.scan.filesystem;

import java.nio.ByteBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;

import javax.annotation.CheckForNull;
import javax.annotation.Nullable;

/**
 * Heuristics used to guess whether a byte buffer is encoded in UTF-8 or UTF-16, plus a strict
 * "try to decode it" check for arbitrary charsets.
 *
 * The class holds no instance state; every method works only on its arguments.
 */
public class CharsetValidation {

  /** Minimum ratio of "null + non-null" pairs (in the expected byte order) to accept a buffer as UTF-16. */
  private static final double UTF_16_NULL_PASS_THRESHOLD = 0.7;
  /** Maximum tolerated ratio of such pairs in the opposite byte order. */
  private static final double UTF_16_NULL_FAIL_THRESHOLD = 0.1;

  /**
   * Checks if an array of bytes looks UTF-16 encoded.
   * We look for clues by checking the presence of nulls and new line control chars in both little and big endian byte orders.
   * Failing on nulls will greatly reduce FPs if the buffer is actually encoded in UTF-32.
   *
   * Note that for any unicode between 0-255, UTF-16 encodes it directly in 2 bytes, the high-order one being 0 (null).
   * Since ASCII, ANSI and control chars are within this range, we count these pairs and see if their ratio is above a certain threshold.
   * It's possible to have valid chars that map to the opposite (non-null followed by a null) even though it is very unlikely.
   * That will happen, for example, for any unicode 0x??00, being ?? between 00 and D7. For this reason, we give a small maximum tolerance
   * for opposite nulls (10%).
   *
   * Line feed code point (0x000A) reversed would be (0x0A00). This code point is reserved and should never be found.
   *
   * @param buffer     the bytes to inspect; a trailing odd byte is never read
   * @param failOnNull if true, a 16-bit unit made of two null bytes immediately yields {@link Result#INVALID}
   * @return a YES result carrying UTF-16BE or UTF-16LE, {@link Result#INVALID}, or a MAYBE result without charset
   *         when no clue was conclusive
   */
  public Result isUTF16(byte[] buffer, boolean failOnNull) {
    if (buffer.length < 2) {
      return Result.INVALID;
    }

    int beAscii = 0;
    int beLines = 0;
    int leAscii = 0;
    int leLines = 0;

    int units = buffer.length / 2;
    for (int i = 0; i < units; i++) {
      // using (signed) bytes is fine, since we only compare with small positive numbers
      byte first = buffer[i * 2];
      byte second = buffer[i * 2 + 1];

      if (first == 0) {
        if (second != 0) {
          // 0x00 0x?? : looks like a big endian char below 256
          if (second == 0x0a || second == 0x0d) {
            beLines++;
          }
          beAscii++;
        } else if (failOnNull) {
          // it's probably UTF-32 or binary
          return Result.INVALID;
        }
      } else if (second == 0) {
        // 0x?? 0x00 : looks like a little endian char below 256
        leAscii++;
        if (first == 0x0a || first == 0x0d) {
          leLines++;
        }
      }
    }

    // ratio of matching pairs, relative to the whole buffer length
    double beAsciiPerc = beAscii * 2.0 / (double) buffer.length;
    double leAsciiPerc = leAscii * 2.0 / (double) buffer.length;

    if (leLines == 0) {
      // could be BE
      if (beAsciiPerc >= UTF_16_NULL_PASS_THRESHOLD && leAsciiPerc < UTF_16_NULL_FAIL_THRESHOLD) {
        return Result.newValid(StandardCharsets.UTF_16BE);
      }
      if (beLines > 0) {
        // this gives FPs for UTF-32 if !failOnNull
        return Result.newValid(StandardCharsets.UTF_16BE);
      }
    } else if (beLines > 0) {
      // lines detected with both endianness -> can't be utf-16
      return Result.INVALID;
    }
    if (beLines == 0) {
      // could be LE
      if (leAsciiPerc >= UTF_16_NULL_PASS_THRESHOLD && beAsciiPerc < UTF_16_NULL_FAIL_THRESHOLD) {
        return Result.newValid(StandardCharsets.UTF_16LE);
      }
      if (leLines > 0) {
        // this gives FPs for UTF-32 if !failOnNull
        return Result.newValid(StandardCharsets.UTF_16LE);
      }
    }

    // if we reach here, it means that there wasn't a line feed for a single endianness and we didn't see a strong null pattern
    // for any of the endianness.
    // It could happen if there are no line feeds in the text and it's a language that does not use ANSI (unicode > 255).
    return new Result(Validation.MAYBE, null);
  }

  /**
   * Checks whether it's a valid big endian UTF-16-encoded buffer.
   * Most sequences of bytes of any encoding will be valid UTF-16, so this is not very effective and gives
   * often false positives.
   *
   * Possible 16bit values in UTF-16:
   *
   * 0x0000-0xD7FF: single 16bit block
   * 0xD800-0xDBFF: first block
   * 0xDC00-0xDFFF: second block
   * 0XE000-0xFFFF: single 16 bit block
   *
   * The following UTF code points get mapped into 1 or 2 blocks:
   * 0x0000 -0xD7FF   (0    -55295)  : 2 bytes, direct mapping
   * 0xE000 -0xFFFF   (57344-65535)  : 2 bytes, direct mapping
   * 0x10000-0x10FFFF (65536-1114111): 2 blocks of 2 bytes (not direct..)
   *
   * Note that Unicode 55296-57343 (0xD800 to 0xDFFF) are not used, since it's reserved and used in UTF-16 for the high/low surrogates.
   *
   * We reject 2-byte blocks with 0 (we consider it's binary) even though it's a valid UTF-16 encoding.
   *
   * @param buffer the bytes to inspect
   * @return true if every 16-bit block of the buffer is acceptable, false otherwise (including buffers shorter than 2 bytes)
   */
  public boolean isValidUTF16(byte[] buffer) {
    return isValidUTF16(buffer, false);
  }

  /**
   * Same check as {@link #isValidUTF16(byte[])}, with an explicit byte order.
   *
   * A high surrogate that is the very last 16-bit block of the buffer is tolerated (the missing low surrogate
   * is not reported). A trailing odd byte is never read.
   *
   * @param buffer the bytes to inspect
   * @param le     true to read the 16-bit blocks as little endian, false for big endian
   * @return true if every 16-bit block of the buffer is acceptable, false otherwise (including buffers shorter than 2 bytes)
   */
  public boolean isValidUTF16(byte[] buffer, boolean le) {
    if (buffer.length < 2) {
      return false;
    }
    int units = buffer.length / 2;
    for (int i = 0; i < units; i++) {
      int c = read16bit(buffer, i, le);

      if (c == 0 || isLowSurrogate(c)) {
        // null block (considered binary) or low surrogate without a preceding high surrogate
        return false;
      }
      if (isHighSurrogate(c)) {
        // the next block, if the buffer still has one, must be the matching low surrogate
        i++;
        if (i < units && !isLowSurrogate(read16bit(buffer, i, le))) {
          return false;
        }
      }
      // else it is a simple 2 byte encoding (code points in BMP), and it's valid
    }
    return true;
  }

  /**
   * Checks if a buffer contains only valid UTF8 encoded bytes.
   * It's very effective, giving a clear YES/NO, unless it's ASCII  (unicode < 127), in which case it returns MAYBE.
   *
   *
   * First byte:
   * 0xxxxxxx: only one byte (0-127)
   * 110xxxxx: 2 bytes       (194-223, as 192/193 are invalid)
   * 1110xxxx: 3 bytes       (224-239)
   * 11110xxx: 4 bytes       (240-244, as 245-247 would encode code points above 0x10FFFF)
   *
   * Bytes 2,3 and 4 are always 10xxxxxx (0x80-0xBF or 128-191).
   *
   * So depending on the number of significant bits in the unicode code point, the length will be 1,2,3 or 4 bytes:
   * 0 -7 bits  (0x0000-007F):  1 byte encoding
   * 8 -11 bits (0x0080-07FF): 2 bytes encoding
   * 12-16 bits (0x0800-FFFF): 3 bytes encoding
   * 17-21 bits (0x10000-10FFFF): 4 bytes encoding
   *
   * A multi-byte sequence cut by the end of the buffer is tolerated. Continuation bytes are only checked for the
   * 10xxxxxx pattern, no further range restriction is applied to them.
   *
   * @param buffer      the bytes to inspect
   * @param rejectNulls if true, a null byte found in a first-byte position yields {@link Result#INVALID}
   * @return {@link Result#INVALID} on the first illegal byte, otherwise a result carrying UTF-8 whose validation is
   *         YES if at least one continuation byte was read and MAYBE if not
   */
  public Result isUTF8(byte[] buffer, boolean rejectNulls) {
    boolean onlyAscii = true;

    for (int i = 0; i < buffer.length; i++) {
      // make it unsigned for the comparisons
      int c = buffer[i] & 0xFF;

      if (rejectNulls && c == 0) {
        return Result.INVALID;
      }

      // number of continuation bytes announced by the first byte
      int remaining;
      if (c < 0x80) {
        remaining = 0;
      } else if (c >= 0xC2 && c <= 0xDF) {
        remaining = 1;
      } else if (c >= 0xE0 && c <= 0xEF) {
        remaining = 2;
      } else if (c >= 0xF0 && c <= 0xF4) {
        remaining = 3;
      } else {
        // stray continuation byte (0x80-0xBF), overlong lead (0xC0/0xC1) or lead beyond 0xF4
        return Result.INVALID;
      }

      while (remaining > 0) {
        i++;
        if (i >= buffer.length) {
          // sequence truncated by the end of the buffer
          break;
        }
        c = buffer[i] & 0xFF;
        onlyAscii = false;

        // first 2 bits should be 10
        if ((c & 0b11000000) != 0b10000000) {
          return Result.INVALID;
        }
        remaining--;
      }
    }

    return onlyAscii ? new Result(Validation.MAYBE, StandardCharsets.UTF_8) : Result.newValid(StandardCharsets.UTF_8);
  }

  /**
   * Tries to use the given charset to decode the byte array.
   * Malformed input and unmappable characters are both reported as errors.
   *
   * @param bytes   the bytes to decode
   * @param charset the charset to try; null is accepted and always gives false
   * @return true if decoding succeeded, false if there was a decoding error or no charset was given.
   */
  public boolean tryDecode(byte[] bytes, @Nullable Charset charset) {
    if (charset == null) {
      return false;
    }
    CharsetDecoder decoder = charset.newDecoder()
      .onMalformedInput(CodingErrorAction.REPORT)
      .onUnmappableCharacter(CodingErrorAction.REPORT);

    try {
      decoder.decode(ByteBuffer.wrap(bytes));
    } catch (CharacterCodingException e) {
      // expected outcome when the bytes do not match the charset
      return false;
    }
    return true;
  }

  /**
   * Reads the 16-bit block number {@code i} of the buffer, i.e. the bytes at offsets {@code 2 * i} and {@code 2 * i + 1}.
   *
   * @param buffer the bytes to read from
   * @param i      index of the 16-bit block (not a byte offset)
   * @param le     true if the first byte is the low-order one (little endian)
   * @return the block as an unsigned value between 0 and 0xFFFF
   */
  private static int read16bit(byte[] buffer, int i, boolean le) {
    int first = buffer[i * 2] & 0xff;
    int second = buffer[i * 2 + 1] & 0xff;
    return le ? (first | (second << 8)) : ((first << 8) | second);
  }

  /** True for 0xD800-0xDBFF, the first block of a surrogate pair. */
  private static boolean isHighSurrogate(int c) {
    return c >= 0xD800 && c < 0xDC00;
  }

  /** True for 0xDC00-0xDFFF, the second block of a surrogate pair. */
  private static boolean isLowSurrogate(int c) {
    return c >= 0xDC00 && c < 0xE000;
  }

  /** Outcome of a validation: rejected, accepted, or inconclusive. */
  public enum Validation {
    NO,
    YES,
    MAYBE
  }

  /** Immutable pair made of a {@link Validation} and the charset it relates to, if any. */
  public static class Result {
    /** Shared result for rejected buffers: validation NO and no charset. */
    static final Result INVALID = new Result(Validation.NO, null);
    private final Validation valid;
    private final Charset charset;

    public Result(Validation valid, @Nullable Charset charset) {
      this.valid = valid;
      this.charset = charset;
    }

    /** Creates a result with validation YES for the given charset. */
    public static Result newValid(Charset charset) {
      return new Result(Validation.YES, charset);
    }

    public Validation valid() {
      return valid;
    }

    /**
     * The charset the validation relates to.
     * It is null for {@link #INVALID}, and may also be null for a MAYBE result.
     */
    @CheckForNull
    public Charset charset() {
      return charset;
    }
  }
}