package org.juxtasoftware.util; import java.io.BufferedInputStream; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.FileReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.io.Reader; import java.io.Writer; import org.apache.commons.io.IOUtils; import org.juxtasoftware.Constants; import org.mozilla.intl.chardet.nsDetector; import org.mozilla.intl.chardet.nsICharsetDetectionObserver; import org.mozilla.universalchardet.UniversalDetector; import org.slf4j.Logger; import org.slf4j.LoggerFactory; public final class EncodingUtils { private static final Logger LOG = LoggerFactory.getLogger( Constants.WS_LOGGER_NAME ); public static final File stripUnknownUTF8(File srcFile) throws IOException { File fixed = null; BufferedReader r = null; OutputStreamWriter osw = null; try { fixed = File.createTempFile("txt", "dat"); osw = new OutputStreamWriter(new FileOutputStream(fixed), "UTF-8"); FileInputStream fis = new FileInputStream(srcFile); InputStreamReader isr = new InputStreamReader(fis, "UTF-8"); r = new BufferedReader(isr); while (true) { String line = r.readLine(); if (line == null) { break; } else { char bad = 0xfffd; line = line.replaceAll("" + bad, " "); osw.write(line + "\n"); } } return fixed; } finally { IOUtils.closeQuietly(r); IOUtils.closeQuietly(osw); } } /** * Normalize the content to UTF-8 and strip any tags that say otherwise * @return A file containing the UTF-8 contents * @throws IOException */ public static File fixEncoding( InputStream source ) throws IOException { File tmpSrc = File.createTempFile("src", "dat"); FileOutputStream fos = new FileOutputStream(tmpSrc); IOUtils.copyLarge(source, fos); IOUtils.closeQuietly(fos); String encoding = EncodingUtils.detectEncoding(tmpSrc); if ( encoding.equalsIgnoreCase("UTF-8") ) { EncodingUtils.finalFixes(tmpSrc); return tmpSrc; } LOG.info("Converting from "+encoding+" to UTF-8"); // stream the input in original encoding to output in UTF-8 File utf8Out = File.createTempFile("utf8out","dat"); Reader in = null; if ( encoding == "UNK" ) { in = new InputStreamReader(new FileInputStream(tmpSrc), "UTF-8" ); // default to a UTF-8 } else { in = new InputStreamReader(new FileInputStream(tmpSrc), encoding); } Writer out = new OutputStreamWriter(new FileOutputStream(utf8Out), "UTF-8"); int c; while ((c = in.read()) != -1){ out.write(c); } IOUtils.closeQuietly(in); IOUtils.closeQuietly(out); if (!tmpSrc.delete() ) { tmpSrc.deleteOnExit(); } // lastly, strip the xml declaration and repair ^M linefeeds. EncodingUtils.finalFixes(utf8Out); return utf8Out; } private static void finalFixes(File tmpSrc) throws IOException { File out=null; try { FileInputStream fis = new FileInputStream(tmpSrc); InputStreamReader isr = new InputStreamReader(fis, "UTF-8"); BufferedReader r = new BufferedReader( isr ); out = File.createTempFile("fix", "dat"); FileOutputStream fos = new FileOutputStream(out); OutputStreamWriter osw = new OutputStreamWriter(fos, "UTF-8"); boolean pastHeader = false; while (true) { String line = r.readLine(); if ( line != null ) { if ( pastHeader == false && line.contains("<?xml") ) { pastHeader = true; int pos = line.indexOf("<?xml"); int end = line.indexOf("?>", pos); line = line.substring(0,pos)+line.substring(end+2); } line += "\n"; osw.write(line); } else { break; } } IOUtils.closeQuietly( r ); IOUtils.closeQuietly(osw); IOUtils.copy(new FileInputStream(out), new FileOutputStream(tmpSrc)); } finally { if (out != null ) { if (!out.delete() ) { out.deleteOnExit(); } } } } private static String detectEncoding(File srcFile) throws IOException { // feed chunks of data to the detector until it is done UniversalDetector detector = new UniversalDetector(null); byte[] buf = new byte[4096]; int nread; String encoding = "utf-8"; FileInputStream fis = null; try { fis = new FileInputStream(srcFile); while ((nread = fis.read(buf)) > 0 && !detector.isDone()) { detector.handleData(buf, 0, nread); } detector.dataEnd(); IOUtils.closeQuietly(fis); encoding = detector.getDetectedCharset(); if ( encoding != null ){ return encoding; } // above failed, try a different encoding detector encoding = EncodingUtils.alternateEncodeDetect(srcFile); if ( encoding != null ){ return encoding; } // all else fails, just look for an encoding declaration in the src encoding = EncodingUtils.scanFileForEncodingDeclaration(srcFile); if ( encoding == null ) { LOG.error("Unable to detect encoding"); encoding = "UNK"; } } catch (IOException e ) { LOG.error("Encoding detection failed", e); encoding = "UNK"; } finally { IOUtils.closeQuietly(fis); } return encoding; } private static String scanFileForEncodingDeclaration(File srcFile) throws IOException { BufferedReader r = new BufferedReader( new FileReader(srcFile )); boolean foundHeader = false; String encoding = null; while (true) { String line = r.readLine(); if ( line != null ) { if ( foundHeader == false && line.contains("<?xml") ) { foundHeader = true; int pos = line.indexOf("<?xml"); int encPos = line.indexOf("encoding=", pos); if (encPos > -1 ) { line = line.replaceAll("\"", "'"); int end = line.indexOf("'", encPos+10); encoding = line.substring(encPos+10, end); break; } else { int end = line.indexOf("?>", pos); if ( end > -1 ) { break; } } } else if (foundHeader == true ) { int encPos = line.indexOf("encoding="); if (encPos > -1 ) { line = line.replaceAll("\"", "'"); int end = line.indexOf("'", encPos+10); encoding = line.substring(encPos+10, end); break; } else { int end = line.indexOf("?>"); if ( end > -1 ) { break; } } } } else { break; } } IOUtils.closeQuietly(r); return encoding; } private static String alternateEncodeDetect(File testFile) throws IOException { nsDetector det = new nsDetector(); DetectListener listener = new DetectListener(); det.Init( listener ); BufferedInputStream imp = new BufferedInputStream(new FileInputStream(testFile)); byte[] buf = new byte[1024]; int len; boolean done = false; boolean isAscii = true; while ((len = imp.read(buf, 0, buf.length)) != -1) { if (isAscii) { isAscii = det.isAscii(buf, len); } if (!isAscii && !done) { done = det.DoIt(buf, len, false); } } det.DataEnd(); imp.close(); return listener.getEncoding(); } private static class DetectListener implements nsICharsetDetectionObserver { private String encoding; public String getEncoding() { return this.encoding; } public void Notify(String charset) { this.encoding = charset; } } }