package com.opslab.util.algorithmImpl; import org.mozilla.intl.chardet.nsDetector; import org.mozilla.intl.chardet.nsICharsetDetectionObserver; import java.io.*; /** * 文件相关的算法实现 */ public class FileImpl { /** * ************************************************** * 以下方式利用mozilla的jchardet作为探测工具 */ private static boolean found = false; /** * 如果完全匹配某个字符集检测算法, 则该属性保存该字符集的名称. 否则(如二进制文件)其值就为默认值 null, 这时应当查询属性 */ private static String encoding = null; /** * 利用文件头特征判断文件的编码方式 * * @param fileName 需要处理的文件 * @return 返回文件编码 */ public static String simpleEncoding(String fileName) { int p = 0; try ( BufferedInputStream bin = new BufferedInputStream(new FileInputStream(fileName)); ) { p = (bin.read() << 8) + bin.read(); } catch (IOException e) { e.printStackTrace(); } String code = null; switch (p) { case 0xefbb: code = "UTF-8"; break; case 0xfffe: code = "Unicode"; break; case 0xfeff: code = "UTF-16BE"; break; default: code = "GBK"; } return code; } /** * 传入一个文件(File)对象,检查文件编码 * * @param file File对象实例 * @return 文件编码,若无,则返回null * @throws FileNotFoundException * @throws IOException */ public static String guestFileEncoding(File file) throws IOException { return geestFileEncoding(file, new nsDetector()); } /** * 获取文件的编码 * * @param file File对象实例 * @param languageHint 语言提示区域代码 eg:1 : Japanese; 2 : Chinese; 3 : Simplified Chinese; * 4 : Traditional Chinese; 5 : Korean; 6 : Dont know (default) * @return 文件编码,eg:UTF-8,GBK,GB2312形式,若无,则返回null * @throws FileNotFoundException * @throws IOException */ public static String guestFileEncoding(File file, int languageHint) throws IOException { return geestFileEncoding(file, new nsDetector(languageHint)); } /** * 获取文件的编码 * * @param path 文件路径 * @return 文件编码,eg:UTF-8,GBK,GB2312形式,若无,则返回null * @throws FileNotFoundException * @throws IOException */ public static String guestFileEncoding(String path) throws IOException { return guestFileEncoding(new File(path)); } /** * 获取文件的编码 * * @param path 文件路径 * @param languageHint 语言提示区域代码 eg:1 : Japanese; 2 : Chinese; 3 : Simplified Chinese; * 4 : Traditional Chinese; 5 : Korean; 6 : Dont know (default) * @return 返回文件的编码 * @throws FileNotFoundException * @throws IOException */ public static String guestFileEncoding(String path, int languageHint) throws FileNotFoundException, IOException { return guestFileEncoding(new File(path), languageHint); } /** * 获取文件的编码 * * @param file 需要处理文件的编码 * @param det nsDetector * @return 返回文件编码 * @throws FileNotFoundException * @throws IOException */ private static String geestFileEncoding(File file, nsDetector det) { det.Init(new nsICharsetDetectionObserver() { public void Notify(String charset) { found = true; encoding = charset; } }); byte[] buf = new byte[1024]; int len; boolean done = false; boolean isAscii = true; try ( BufferedInputStream imp = new BufferedInputStream(new FileInputStream(file)); ) { while ((len = imp.read(buf, 0, buf.length)) != -1) { // Check if the stream is only ascii. if (isAscii) { isAscii = det.isAscii(buf, len); } // DoIt if non-ascii and not done yet. if (!isAscii && !done) { done = det.DoIt(buf, len, false); } } det.DataEnd(); } catch (IOException e) { e.printStackTrace(); } if (isAscii) { encoding = "ASCII"; found = true; } if (!found) { String prob[] = det.getProbableCharsets(); if (prob.length > 0) { // 在没有发现情况下,则取第一个可能的编码 encoding = prob[0]; } else { return null; } } return encoding; } }