// CharsetValidationTest.java example (from the sonarqube-master source tree)
/*
 * SonarQube
 * Copyright (C) 2009-2017 SonarSource SA
 * mailto:info AT sonarsource DOT com
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 3 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 */
package org.sonar.scanner.scan.filesystem;

import static org.assertj.core.api.Assertions.assertThat;

import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.List;

import org.apache.commons.lang.StringUtils;
import org.junit.Before;
import org.junit.Test;
import org.sonar.scanner.scan.filesystem.CharsetValidation.Validation;

/**
 * Unit tests for {@link CharsetValidation}.
 *
 * <p>The tests feed {@code isUTF8}, {@code isUTF16} and {@code isValidUTF16} with buffers produced
 * either by encoding a known string in a given charset (see {@link #encode(String, Charset)}) or by
 * spelling out raw bytes in hexadecimal (see {@link #hexToByte(String)}).
 */
public class CharsetValidationTest {
  private CharsetValidation charsets;

  /** Creates a fresh {@link CharsetValidation} before every test. */
  @Before
  public void setUp() {
    charsets = new CharsetValidation();
  }

  /**
   * Encodes the content of a sample source file in UTF-8, UTF-16BE and UTF-16LE and checks that
   * each encoding is recognized.
   */
  @Test
  public void testWithSourceCode() throws IOException, URISyntaxException {
    Path path = Paths.get(this.getClass().getClassLoader().getResource("mediumtest/xoo/sample/xources/hello/HelloJava.xoo").toURI());
    List<String> lines = Files.readAllLines(path, StandardCharsets.UTF_8);
    // lines are concatenated as-is: readAllLines strips the line terminators and none is re-added
    String text = String.join("", lines);

    byte[] utf8 = encode(text, StandardCharsets.UTF_8);
    byte[] utf16be = encode(text, StandardCharsets.UTF_16BE);
    byte[] utf16le = encode(text, StandardCharsets.UTF_16LE);

    assertThat(charsets.isUTF8(utf8, true).charset()).isEqualTo(StandardCharsets.UTF_8);
    assertThat(charsets.isUTF16(utf16be, true).charset()).isEqualTo(StandardCharsets.UTF_16BE);
    assertThat(charsets.isUTF16(utf16le, true).charset()).isEqualTo(StandardCharsets.UTF_16LE);

    assertThat(charsets.isValidUTF16(utf16be, false)).isTrue();
    assertThat(charsets.isValidUTF16(utf16le, true)).isTrue();
  }

  /** Checks UTF-16 detection on a short non-ASCII text that ends with a new line. */
  @Test
  public void detectUTF16NewLine() throws CharacterCodingException {
    // the first char will be encoded with a null on the second byte, but we should still detect it due to the new line
    String text = "\uA100" + "\uA212" + "\n";

    byte[] utf16be = encode(text, StandardCharsets.UTF_16BE);
    byte[] utf16le = encode(text, StandardCharsets.UTF_16LE);
    byte[] utf8 = encode(text, StandardCharsets.UTF_8);
    byte[] utf32 = encode(text, Charset.forName("UTF-32LE"));

    assertThat(charsets.isUTF16(utf16le, true).charset()).isEqualTo(StandardCharsets.UTF_16LE);
    assertThat(charsets.isUTF16(utf16be, true).charset()).isEqualTo(StandardCharsets.UTF_16BE);
    assertThat(charsets.isUTF16(utf8, true).valid()).isEqualTo(Validation.MAYBE);
    // this will have a double null, so it will be yes or no based on failOnNull
    assertThat(charsets.isUTF16(utf32, true).valid()).isEqualTo(Validation.NO);
    assertThat(charsets.isUTF16(utf32, false).valid()).isEqualTo(Validation.YES);
  }

  /** Checks UTF-16 detection on plain ASCII text encoded in several charsets. */
  @Test
  public void detectUTF16Ascii() throws CharacterCodingException {
    String text = "some text to test";
    byte[] utf16be = encode(text, StandardCharsets.UTF_16BE);
    byte[] utf16le = encode(text, StandardCharsets.UTF_16LE);
    byte[] utf8 = encode(text, StandardCharsets.UTF_8);
    byte[] iso88591 = encode(text, StandardCharsets.ISO_8859_1);
    byte[] utf32 = encode(text, Charset.forName("UTF-32LE"));

    assertThat(charsets.isUTF16(utf16le, true).charset()).isEqualTo(StandardCharsets.UTF_16LE);
    assertThat(charsets.isUTF16(utf16be, true).charset()).isEqualTo(StandardCharsets.UTF_16BE);
    // not enough nulls -> we don't know
    assertThat(charsets.isUTF16(iso88591, true).valid()).isEqualTo(Validation.MAYBE);
    assertThat(charsets.isUTF16(utf8, true).valid()).isEqualTo(Validation.MAYBE);
    // fail based on double nulls
    assertThat(charsets.isUTF16(utf32, true).valid()).isEqualTo(Validation.NO);
  }

  /** Checks that a well-formed 3-byte UTF-8 sequence is accepted. */
  @Test
  public void validUTF8() {
    // UTF8 with 3 bytes
    byte[] b = hexToByte("E2 80 A6");
    assertThat(charsets.isUTF8(b, true).valid()).isEqualTo(Validation.YES);
  }

  /** Checks byte sequences that must be rejected as UTF-16. */
  @Test
  public void invalidUTF16() {
    // UTF-16 will accept anything in a direct 2-byte block unless it is between D800-DFFF (high and low surrogates).
    // In that case it is a 4-byte encoding, not a direct encoding.
    byte[] b1 = hexToByte("D800 0000");
    assertThat(charsets.isValidUTF16(b1)).isFalse();

    byte[] b1le = hexToByte("0000 D800");
    assertThat(charsets.isValidUTF16(b1le, true)).isFalse();

    // not enough bytes (any byte following this one would make it valid)
    byte[] b2 = {(byte) 0x01};
    assertThat(charsets.isValidUTF16(b2)).isFalse();

    // we reject double 0
    byte[] b3 = {(byte) 0, (byte) 0};
    assertThat(charsets.isValidUTF16(b3)).isFalse();
  }

  /** Checks byte sequences that must be rejected as UTF-8. */
  @Test
  public void invalidUTF8() {
    // never expects to see 0xFF or 0xC0..
    byte[] b1 = {(byte) 0xFF};
    assertThat(charsets.isUTF8(b1, true).valid()).isEqualTo(Validation.NO);

    byte[] b1c = {(byte) 0xC0};
    assertThat(charsets.isUTF8(b1c, true).valid()).isEqualTo(Validation.NO);

    // the first byte indicates a 2-byte encoding, but second byte is not valid
    byte[] b2 = {(byte) 0b11000010, (byte) 0b11000000};
    assertThat(charsets.isUTF8(b2, true).valid()).isEqualTo(Validation.NO);

    // we reject nulls (mainly to reject UTF-16)
    byte[] b3 = {(byte) 0};
    assertThat(charsets.isUTF8(b3, true).valid()).isEqualTo(Validation.NO);
  }

  /** Checks that a buffer ending in the middle of a multi-byte sequence is not reported as invalid. */
  @Test
  public void dontFailIfNotEnoughBytes() {
    byte[] b1 = hexToByte("D800");
    assertThat(charsets.isValidUTF16(b1)).isTrue();

    // the first byte indicates a 2-byte encoding, but there is no second byte
    byte[] b2 = {(byte) 0b11000010};
    assertThat(charsets.isUTF8(b2, true).valid()).isEqualTo(Validation.MAYBE);
  }

  /**
   * Encodes {@code txt} with {@code charset} and returns exactly the encoded bytes.
   *
   * <p>The encoder is configured to report malformed input and unmappable characters instead of
   * replacing them, so a text that cannot be represented makes the test fail rather than silently
   * producing a different buffer.
   *
   * @param txt text to encode
   * @param charset target charset
   * @return the encoded bytes, with no BOM other than what the charset's encoder emits
   * @throws CharacterCodingException if the text cannot be encoded in the given charset
   */
  private static byte[] encode(String txt, Charset charset) throws CharacterCodingException {
    CharsetEncoder encoder = charset.newEncoder()
      .onMalformedInput(CodingErrorAction.REPORT)
      .onUnmappableCharacter(CodingErrorAction.REPORT);
    ByteBuffer encoded = encoder.encode(CharBuffer.wrap(txt));
    // copy only the bytes actually written; the backing array may be larger
    byte[] b = new byte[encoded.remaining()];
    encoded.get(b);
    return b;
  }

  /**
   * Converts a hexadecimal string such as {@code "E2 80 A6"} to the bytes it spells out.
   * Whitespace is ignored, so digits may be grouped freely.
   *
   * @param str hexadecimal digits, optionally separated by whitespace
   * @return one byte per pair of hex digits, in order
   * @throws IllegalArgumentException if the number of digits is odd or a character is not a hex digit
   */
  private static byte[] hexToByte(String str) {
    String s = StringUtils.deleteWhitespace(str);
    int len = s.length();
    if (len % 2 != 0) {
      throw new IllegalArgumentException("Hex string must contain an even number of digits: '" + str + "'");
    }
    byte[] data = new byte[len / 2];
    for (int i = 0; i < len; i += 2) {
      int high = Character.digit(s.charAt(i), 16);
      int low = Character.digit(s.charAt(i + 1), 16);
      // Character.digit returns -1 for a non-hex character; fail fast instead of building a wrong byte
      if (high < 0 || low < 0) {
        throw new IllegalArgumentException("Invalid hex digit in '" + str + "'");
      }
      data[i / 2] = (byte) ((high << 4) + low);
    }
    return data;
  }

}