/*
* SonarQube
* Copyright (C) 2009-2017 SonarSource SA
* mailto:info AT sonarsource DOT com
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 3 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the Free Software Foundation,
* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
package org.sonar.scanner.scan.filesystem;
import static org.assertj.core.api.Assertions.assertThat;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.List;
import org.apache.commons.lang.StringUtils;
import org.junit.Before;
import org.junit.Test;
import org.sonar.scanner.scan.filesystem.CharsetValidation.Validation;
/**
 * Unit tests for {@link CharsetValidation}, covering its UTF-8 and UTF-16 checks
 * ({@code isUTF8}, {@code isUTF16} and {@code isValidUTF16}).
 */
public class CharsetValidationTest {
  private CharsetValidation charsets;

  @Before
  public void setUp() {
    charsets = new CharsetValidation();
  }

  /**
   * Encodes the content of a sample source file in UTF-8, UTF-16BE and UTF-16LE and checks that
   * each encoding is recognised.
   */
  @Test
  public void testWithSourceCode() throws IOException, URISyntaxException {
    Path path = Paths.get(this.getClass().getClassLoader().getResource("mediumtest/xoo/sample/xources/hello/HelloJava.xoo").toURI());
    List<String> lines = Files.readAllLines(path, StandardCharsets.UTF_8);
    // Lines are concatenated without any separator, so the encoded text contains no line breaks.
    String text = String.join("", lines);

    byte[] utf8 = encode(text, StandardCharsets.UTF_8);
    byte[] utf16be = encode(text, StandardCharsets.UTF_16BE);
    byte[] utf16le = encode(text, StandardCharsets.UTF_16LE);

    assertThat(charsets.isUTF8(utf8, true).charset()).isEqualTo(StandardCharsets.UTF_8);
    assertThat(charsets.isUTF16(utf16be, true).charset()).isEqualTo(StandardCharsets.UTF_16BE);
    assertThat(charsets.isUTF16(utf16le, true).charset()).isEqualTo(StandardCharsets.UTF_16LE);

    // NOTE(review): the second argument presumably selects little-endian decoding - confirm against CharsetValidation
    assertThat(charsets.isValidUTF16(utf16be, false)).isTrue();
    assertThat(charsets.isValidUTF16(utf16le, true)).isTrue();
  }

  /**
   * Checks UTF-16 detection on a short text that ends with a new line.
   */
  @Test
  public void detectUTF16NewLine() throws CharacterCodingException {
    // the first char will be encoded with a null on the second byte, but we should still detect it due to the new line
    String text = "\uA100" + "\uA212" + "\n";

    byte[] utf16be = encode(text, StandardCharsets.UTF_16BE);
    byte[] utf16le = encode(text, StandardCharsets.UTF_16LE);
    byte[] utf8 = encode(text, StandardCharsets.UTF_8);
    byte[] utf32 = encode(text, Charset.forName("UTF-32LE"));
    // Only shown when one of the UTF-32 assertions fails; nothing is printed on a successful run.
    String utf32Description = "UTF-32LE bytes: " + Arrays.toString(utf32);

    assertThat(charsets.isUTF16(utf16le, true).charset()).isEqualTo(StandardCharsets.UTF_16LE);
    assertThat(charsets.isUTF16(utf16be, true).charset()).isEqualTo(StandardCharsets.UTF_16BE);
    assertThat(charsets.isUTF16(utf8, true).valid()).isEqualTo(Validation.MAYBE);

    // this will have a double null, so it will be yes or no based on failOnNull
    assertThat(charsets.isUTF16(utf32, true).valid()).as(utf32Description).isEqualTo(Validation.NO);
    assertThat(charsets.isUTF16(utf32, false).valid()).as(utf32Description).isEqualTo(Validation.YES);
  }

  /**
   * Checks UTF-16 detection on plain ASCII text encoded with several charsets.
   */
  @Test
  public void detectUTF16Ascii() throws CharacterCodingException {
    String text = "some text to test";

    byte[] utf16be = encode(text, StandardCharsets.UTF_16BE);
    byte[] utf16le = encode(text, StandardCharsets.UTF_16LE);
    byte[] utf8 = encode(text, StandardCharsets.UTF_8);
    byte[] iso88591 = encode(text, StandardCharsets.ISO_8859_1);
    byte[] utf32 = encode(text, Charset.forName("UTF-32LE"));

    assertThat(charsets.isUTF16(utf16le, true).charset()).isEqualTo(StandardCharsets.UTF_16LE);
    assertThat(charsets.isUTF16(utf16be, true).charset()).isEqualTo(StandardCharsets.UTF_16BE);

    // not enough nulls -> we don't know
    assertThat(charsets.isUTF16(iso88591, true).valid()).isEqualTo(Validation.MAYBE);
    assertThat(charsets.isUTF16(utf8, true).valid()).isEqualTo(Validation.MAYBE);

    // fail based on double nulls
    assertThat(charsets.isUTF16(utf32, true).valid()).isEqualTo(Validation.NO);
  }

  @Test
  public void validUTF8() {
    // a single character encoded on 3 bytes in UTF-8
    byte[] b = hexToByte("E2 80 A6");
    assertThat(charsets.isUTF8(b, true).valid()).isEqualTo(Validation.YES);
  }

  @Test
  public void invalidUTF16() {
    // UTF-16 accepts any 2-byte unit as a directly encoded character, unless the unit is in D800-DFFF
    // (high and low surrogates). In that range the unit is half of a 4-byte surrogate pair.
    byte[] b1 = hexToByte("D800 0000");
    assertThat(charsets.isValidUTF16(b1)).isFalse();

    byte[] b1le = hexToByte("0000 D800");
    assertThat(charsets.isValidUTF16(b1le, true)).isFalse();

    // not enough bytes (any byte following this one would make it valid)
    byte[] b2 = {(byte) 0x01};
    assertThat(charsets.isValidUTF16(b2)).isFalse();

    // we reject double 0
    byte[] b3 = {(byte) 0, (byte) 0};
    assertThat(charsets.isValidUTF16(b3)).isFalse();
  }

  @Test
  public void invalidUTF8() {
    // 0xFF and 0xC0 are never expected in UTF-8
    byte[] b1 = {(byte) 0xFF};
    assertThat(charsets.isUTF8(b1, true).valid()).isEqualTo(Validation.NO);

    byte[] b1c = {(byte) 0xC0};
    assertThat(charsets.isUTF8(b1c, true).valid()).isEqualTo(Validation.NO);

    // the first byte indicates a 2-byte encoding, but second byte is not valid
    byte[] b2 = {(byte) 0b11000010, (byte) 0b11000000};
    assertThat(charsets.isUTF8(b2, true).valid()).isEqualTo(Validation.NO);

    // we reject nulls (mainly to reject UTF-16)
    byte[] b3 = {(byte) 0};
    assertThat(charsets.isUTF8(b3, true).valid()).isEqualTo(Validation.NO);
  }

  /**
   * Input that stops in the middle of a multi-unit sequence must not be reported as invalid.
   */
  @Test
  public void dontFailIfNotEnoughBytes() {
    // a lone 2-byte unit from the surrogate range, with nothing after it
    byte[] b1 = hexToByte("D800");
    assertThat(charsets.isValidUTF16(b1)).isTrue();

    // the first byte indicates a 2-byte encoding, but there is no second byte
    byte[] b2 = {(byte) 0b11000010};
    assertThat(charsets.isUTF8(b2, true).valid()).isEqualTo(Validation.MAYBE);
  }

  /**
   * Encodes {@code txt} with {@code charset}, reporting (rather than replacing) malformed input
   * and unmappable characters.
   *
   * @param txt text to encode
   * @param charset target charset
   * @return exactly the encoded bytes
   * @throws CharacterCodingException if the text cannot be encoded with the given charset
   */
  private static byte[] encode(String txt, Charset charset) throws CharacterCodingException {
    CharsetEncoder encoder = charset.newEncoder()
      .onMalformedInput(CodingErrorAction.REPORT)
      .onUnmappableCharacter(CodingErrorAction.REPORT);
    ByteBuffer encoded = encoder.encode(CharBuffer.wrap(txt));
    // Copy only the remaining bytes of the buffer into the result.
    byte[] bytes = new byte[encoded.remaining()];
    encoded.get(bytes);
    return bytes;
  }

  /**
   * Converts a string of hexadecimal digits into bytes. Whitespace is removed first, so digits may
   * be grouped freely (for example {@code "E2 80 A6"} or {@code "D800 0000"}).
   *
   * @param str hexadecimal digits, two per byte
   * @return the decoded bytes
   * @throws IllegalArgumentException if the number of digits is odd or a character is not a hexadecimal digit
   */
  private static byte[] hexToByte(String str) {
    String hex = StringUtils.deleteWhitespace(str);
    int len = hex.length();
    if (len % 2 != 0) {
      throw new IllegalArgumentException("Hex string must contain an even number of digits: '" + str + "'");
    }
    byte[] data = new byte[len / 2];
    for (int i = 0; i < len; i += 2) {
      int high = Character.digit(hex.charAt(i), 16);
      int low = Character.digit(hex.charAt(i + 1), 16);
      // Character.digit returns -1 for a character that is not a valid digit in the given radix.
      if (high < 0 || low < 0) {
        throw new IllegalArgumentException("Invalid hexadecimal digit in: '" + str + "'");
      }
      data[i / 2] = (byte) ((high << 4) + low);
    }
    return data;
  }
}