/*
* SonarQube
* Copyright (C) 2009-2017 SonarSource SA
* mailto:info AT sonarsource DOT com
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 3 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the Free Software Foundation,
* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
package org.sonar.scanner.scan.filesystem;
import static java.nio.charset.StandardCharsets.UTF_16;
import static java.nio.charset.StandardCharsets.UTF_16LE;
import static java.nio.charset.StandardCharsets.UTF_16BE;
import static java.nio.charset.StandardCharsets.UTF_8;
import java.nio.charset.Charset;
import java.util.Arrays;
import javax.annotation.CheckForNull;
import org.apache.commons.io.ByteOrderMark;
import org.sonar.scanner.scan.filesystem.CharsetValidation.Result;
import org.sonar.scanner.scan.filesystem.CharsetValidation.Validation;
/**
 * Guesses the charset of a raw byte buffer by combining the checks exposed by
 * {@link CharsetValidation} with the charset configured by the user, and
 * recognizes a leading byte order mark (BOM).
 */
public class ByteCharsetDetector {
  // Order matters: a BOM that is a prefix of a longer BOM must be listed after it.
  // The UTF-16LE mark (FF FE) is a prefix of the UTF-32LE mark (FF FE 00 00), so the
  // UTF-32 marks have to be tried before the UTF-16 ones.
  private static final ByteOrderMark[] BOMS = {ByteOrderMark.UTF_8, ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE,
    ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE};

  private final Charset userConfiguration;
  private final CharsetValidation validator;

  /**
   * @param validator         performs the byte-level charset checks
   * @param userConfiguration charset configured by the user, used as a fallback candidate
   */
  public ByteCharsetDetector(CharsetValidation validator, Charset userConfiguration) {
    this.validator = validator;
    this.userConfiguration = userConfiguration;
  }

  /**
   * Tries to determine the charset in which {@code buf} is encoded.
   * <ol>
   * <li>UTF-8 check: a {@code YES} returns the charset reported by the validator; a {@code MAYBE}
   * falls back to the user-configured charset (see {@link #detectAscii(byte[])}).</li>
   * <li>UTF-16 check: a {@code YES} that also passes {@code isValidUTF16} returns the charset
   * reported by the validator.</li>
   * <li>Otherwise the user-configured charset is returned if it is an acceptable candidate and
   * the buffer can be decoded with it.</li>
   * </ol>
   *
   * @param buf bytes to inspect
   * @return the detected charset, or {@code null} if none of the checks succeeded
   */
  @CheckForNull
  public Charset detect(byte[] buf) {
    // Try UTF-8 first since we are very confident in it if it's a yes.
    // Fail if we see nulls to not have FPs if the text is ASCII encoded in UTF-16.
    Result utf8Result = validator.isUTF8(buf, true);
    if (utf8Result.valid() == Validation.YES) {
      return utf8Result.charset();
    } else if (utf8Result.valid() == Validation.MAYBE) {
      return detectAscii(buf);
    }

    // Try UTF-16 with both endiannesses. Fail if we see nulls to not have FPs if it's UTF-32.
    Result utf16 = validator.isUTF16(buf, true);
    if (utf16.valid() == Validation.YES && validator.isValidUTF16(buf, UTF_16LE.equals(utf16.charset()))) {
      return utf16.charset();
    }

    // At this point we know it can't be UTF-8. The user configuration is only tried when it is
    // not UTF-8 and, if it is a UTF-16 variant, only when the UTF-16 check answered MAYBE.
    Charset c = userConfiguration;
    if (!UTF_8.equals(c) && (!isUtf16(c) || utf16.valid() == Validation.MAYBE) && validator.tryDecode(buf, c)) {
      return c;
    }

    return null;
  }

  /**
   * Fallback used when the UTF-8 check answered {@code MAYBE}: returns the user-configured charset
   * if it is not a UTF-16/UTF-32 variant and the buffer can be decoded with it.
   *
   * @return the user-configured charset, or {@code null} if it is not applicable
   */
  private Charset detectAscii(byte[] buf) {
    if (!isUtf16Or32(userConfiguration) && validator.tryDecode(buf, userConfiguration)) {
      return userConfiguration;
    }
    return null;
  }

  /** Tells whether {@code charset} is UTF-16, UTF-16BE or UTF-16LE. */
  private static boolean isUtf16(Charset charset) {
    return UTF_16.equals(charset) || UTF_16BE.equals(charset) || UTF_16LE.equals(charset);
  }

  /** Tells whether {@code charset} is one of the UTF-16 variants, UTF-32BE or UTF-32LE. */
  private static boolean isUtf16Or32(Charset charset) {
    return isUtf16(charset) || MetadataGenerator.UTF_32BE.equals(charset) || MetadataGenerator.UTF_32LE.equals(charset);
  }

  /**
   * Finds the byte order mark that {@code buffer} starts with.
   *
   * @param buffer bytes to inspect
   * @return the first entry of the ordered BOM list that prefixes {@code buffer},
   *         or {@code null} if none matches
   */
  @CheckForNull
  public ByteOrderMark detectBOM(byte[] buffer) {
    // findFirst(), not findAny(): the list is ordered so that longer marks win over their
    // prefixes, and findAny() is allowed to return any matching element.
    return Arrays.stream(BOMS)
      .filter(b -> isBom(b, buffer))
      .findFirst()
      .orElse(null);
  }

  /** Tells whether {@code buffer} is at least as long as {@code bom} and starts with its bytes. */
  private static boolean isBom(ByteOrderMark bom, byte[] buffer) {
    if (buffer.length < bom.length()) {
      return false;
    }
    for (int i = 0; i < bom.length(); i++) {
      // bom.get(i) is narrowed to a byte so it can be compared with the signed buffer content
      if ((byte) bom.get(i) != buffer[i]) {
        return false;
      }
    }
    return true;
  }
}