package uk.ac.ox.zoo.seeg.abraid.mp.common.util;

import org.mozilla.universalchardet.UniversalDetector;
import org.springframework.util.StringUtils;

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.UnsupportedCharsetException;
import java.util.Arrays;

/**
 * Contains utilities relating to character sets.
 * Copyright (c) 2014 University of Oxford
 */
public final class CharacterSetUtils {
    private CharacterSetUtils() {
    }

    /**
     * Detects the character set of the input text.
     * @param input The input text as a byte array. The array itself is not passed to the detector; a copy is.
     * @return The character set of the input text, or null if the input is null or the character set
     * cannot be detected.
     * @throws RuntimeException if the detector reports a character set name that this JVM does not support.
     * The original {@link UnsupportedCharsetException} is attached as the cause.
     */
    public static Charset detectCharacterSet(byte[] input) {
        if (input == null) {
            return null;
        }

        // Hand the detector a private copy rather than the caller's array.
        byte[] data = input.clone();

        UniversalDetector detector = new UniversalDetector(null);
        detector.handleData(data, 0, data.length);
        detector.dataEnd();

        String detectedCharset = detector.getDetectedCharset();
        if (!StringUtils.hasText(detectedCharset)) {
            // The detector did not report a name.
            return null;
        }

        try {
            return Charset.forName(detectedCharset);
        } catch (UnsupportedCharsetException e) {
            // Keep the original exception as the cause so the stack trace is not lost.
            throw new RuntimeException("Detected unsupported character set " + detectedCharset, e);
        }
    }

    /**
     * Converts the input text between the two specified character sets.
     * @param input The input text as a byte array.
     * @param fromCharset The source character set.
     * @param toCharSet The destination character set.
     * @return The input text, converted from the source to the destination character set, or null if the
     * input is null.
     */
    public static byte[] convertToCharacterSet(byte[] input, Charset fromCharset, Charset toCharSet) {
        if (input == null) {
            return null;
        }

        CharBuffer decodedData = fromCharset.decode(ByteBuffer.wrap(input));
        ByteBuffer encodedData = toCharSet.encode(decodedData);

        if (encodedData.hasArray()) {
            // Copy exactly the readable region, honouring the buffer's array offset and position
            // instead of assuming the encoded bytes start at index 0 of the backing array.
            int start = encodedData.arrayOffset() + encodedData.position();
            return Arrays.copyOfRange(encodedData.array(), start, start + encodedData.remaining());
        }

        // Buffers without an accessible backing array must be drained through the buffer API.
        byte[] result = new byte[encodedData.remaining()];
        encodedData.get(result);
        return result;
    }
}