package com.opslab.util; import net.sourceforge.pinyin4j.PinyinHelper; import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType; import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat; import net.sourceforge.pinyin4j.format.HanyuPinyinToneType; import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType; import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * 一些中文相关的操作方法 */ public final class ChinesUtil { private ChinesUtil(){ } /** * 将字符串中的中文转化为拼音,其他字符不变 * * @param inputString * @return */ public final static String getPingYin(String inputString) { HanyuPinyinOutputFormat format = new HanyuPinyinOutputFormat(); format.setCaseType(HanyuPinyinCaseType.LOWERCASE); format.setToneType(HanyuPinyinToneType.WITHOUT_TONE); format.setVCharType(HanyuPinyinVCharType.WITH_V); char[] input = inputString.trim().toCharArray(); String output = ""; try { for (int i = 0; i < input.length; i++) { if (java.lang.Character.toString(input[i]).matches("[\\u4E00-\\u9FA5]+")) { String[] temp = PinyinHelper.toHanyuPinyinStringArray(input[i], format); output += temp[0]; } else output += java.lang.Character.toString(input[i]); } } catch (BadHanyuPinyinOutputFormatCombination e) { e.printStackTrace(); } return output; } /** * 获取汉字串拼音首字母,英文字符不变 * * @param chinese 汉字串 * @return 汉语拼音首字母 */ public final static String getFirstSpell(String chinese) { StringBuffer pybf = new StringBuffer(); char[] arr = chinese.toCharArray(); HanyuPinyinOutputFormat defaultFormat = new HanyuPinyinOutputFormat(); defaultFormat.setCaseType(HanyuPinyinCaseType.LOWERCASE); defaultFormat.setToneType(HanyuPinyinToneType.WITHOUT_TONE); for (int i = 0; i < arr.length; i++) { if (arr[i] > 128) { try { String[] temp = PinyinHelper.toHanyuPinyinStringArray(arr[i], defaultFormat); if (temp != null) { pybf.append(temp[0].charAt(0)); } } catch (BadHanyuPinyinOutputFormatCombination e) { e.printStackTrace(); } } else { pybf.append(arr[i]); } } return pybf.toString().replaceAll("\\W", "").trim(); } /** * 获取汉字串拼音,英文字符不变 * * @param chinese 汉字串 * @return 汉语拼音 */ public final static String getFullSpell(String chinese) { StringBuffer pybf = new StringBuffer(); char[] arr = chinese.toCharArray(); HanyuPinyinOutputFormat defaultFormat = new HanyuPinyinOutputFormat(); defaultFormat.setCaseType(HanyuPinyinCaseType.LOWERCASE); defaultFormat.setToneType(HanyuPinyinToneType.WITHOUT_TONE); for (int i = 0; i < arr.length; i++) { if (arr[i] > 128) { try { pybf.append(PinyinHelper.toHanyuPinyinStringArray(arr[i], defaultFormat)[0]); } catch (BadHanyuPinyinOutputFormatCombination e) { e.printStackTrace(); } } else { pybf.append(arr[i]); } } return pybf.toString(); } // 只能判断部分CJK字符(CJK统一汉字) public final static boolean isChineseByREG(String str) { if (str == null) { return false; } Pattern pattern = Pattern.compile("[\\u4E00-\\u9FBF]+"); return pattern.matcher(str.trim()).find(); } // 只能判断部分CJK字符(CJK统一汉字) public final static boolean isChineseByName(String str) { if (str == null) { return false; } // 大小写不同:\\p 表示包含,\\P 表示不包含 // \\p{Cn} 的意思为 Unicode 中未被定义字符的编码,\\P{Cn} 就表示 Unicode中已经被定义字符的编码 String reg = "\\p{InCJK Unified Ideographs}&&\\P{Cn}"; Pattern pattern = Pattern.compile(reg); return pattern.matcher(str.trim()).find(); } // 完整的判断中文汉字和符号 public final static boolean isChinese(String strName) { char[] ch = strName.toCharArray(); for (int i = 0; i < ch.length; i++) { char c = ch[i]; if (isChinese(c)) { return true; } } return false; } /** * 判断是否是中文 * * @param c * @return */ public final static boolean isChinese(char c) { Character.UnicodeBlock ub = Character.UnicodeBlock.of(c); if (ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A || ub == Character.UnicodeBlock.GENERAL_PUNCTUATION || ub == Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION || ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) { return true; } return false; } /** * 获取一个字符串中中文字符的个数 */ public final static int ChineseLength(String str) { Pattern p = Pattern.compile("[\u4E00-\u9FA5]+"); Matcher m = p.matcher(str); int i = 0; while (m.find()) { String temp = m.group(0); i += temp.length(); } return i; } /** * 判断是否是乱码 * * @param strName * @return */ public final static boolean isMessyCode(String strName) { Pattern p = Pattern.compile("\\s*|\t*|\r*|\n*"); Matcher m = p.matcher(strName); String after = m.replaceAll(""); String temp = after.replaceAll("\\p{P}", ""); char[] ch = temp.trim().toCharArray(); float chLength = 0; float count = 0; for (int i = 0; i < ch.length; i++) { char c = ch[i]; if (!Character.isLetterOrDigit(c)) { if (!ChinesUtil.isChinese(c)) { count = count + 1; } chLength++; } } float result = count / chLength; if (result > 0.4) { return true; } else { return false; } } }