/* * Copyright (c) 2003, PostgreSQL Global Development Group * See the LICENSE file in the project root for more information. */ package org.postgresql.core; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStream; import java.io.OutputStreamWriter; import java.io.Reader; import java.io.Writer; import java.nio.charset.Charset; import java.util.HashMap; import java.util.logging.Level; import java.util.logging.Logger; /** * Representation of a particular character encoding. */ public class Encoding { private static final Logger LOGGER = Logger.getLogger(Encoding.class.getName()); private static final Encoding DEFAULT_ENCODING = new Encoding(); private static final Encoding UTF8_ENCODING = new Encoding("UTF-8"); /* * Preferred JVM encodings for backend encodings. */ private static final HashMap<String, String[]> encodings = new HashMap<String, String[]>(); static { //Note: this list should match the set of supported server // encodings found in backend/util/mb/encnames.c encodings.put("SQL_ASCII", new String[]{"ASCII", "US-ASCII"}); encodings.put("UNICODE", new String[]{"UTF-8", "UTF8"}); encodings.put("UTF8", new String[]{"UTF-8", "UTF8"}); encodings.put("LATIN1", new String[]{"ISO8859_1"}); encodings.put("LATIN2", new String[]{"ISO8859_2"}); encodings.put("LATIN3", new String[]{"ISO8859_3"}); encodings.put("LATIN4", new String[]{"ISO8859_4"}); encodings.put("ISO_8859_5", new String[]{"ISO8859_5"}); encodings.put("ISO_8859_6", new String[]{"ISO8859_6"}); encodings.put("ISO_8859_7", new String[]{"ISO8859_7"}); encodings.put("ISO_8859_8", new String[]{"ISO8859_8"}); encodings.put("LATIN5", new String[]{"ISO8859_9"}); encodings.put("LATIN7", new String[]{"ISO8859_13"}); encodings.put("LATIN9", new String[]{"ISO8859_15_FDIS"}); encodings.put("EUC_JP", new String[]{"EUC_JP"}); encodings.put("EUC_CN", new String[]{"EUC_CN"}); encodings.put("EUC_KR", new String[]{"EUC_KR"}); encodings.put("JOHAB", new String[]{"Johab"}); encodings.put("EUC_TW", new String[]{"EUC_TW"}); encodings.put("SJIS", new String[]{"MS932", "SJIS"}); encodings.put("BIG5", new String[]{"Big5", "MS950", "Cp950"}); encodings.put("GBK", new String[]{"GBK", "MS936"}); encodings.put("UHC", new String[]{"MS949", "Cp949", "Cp949C"}); encodings.put("TCVN", new String[]{"Cp1258"}); encodings.put("WIN1256", new String[]{"Cp1256"}); encodings.put("WIN1250", new String[]{"Cp1250"}); encodings.put("WIN874", new String[]{"MS874", "Cp874"}); encodings.put("WIN", new String[]{"Cp1251"}); encodings.put("ALT", new String[]{"Cp866"}); // We prefer KOI8-U, since it is a superset of KOI8-R. encodings.put("KOI8", new String[]{"KOI8_U", "KOI8_R"}); // If the database isn't encoding-aware then we can't have // any preferred encodings. encodings.put("UNKNOWN", new String[0]); // The following encodings do not have a java equivalent encodings.put("MULE_INTERNAL", new String[0]); encodings.put("LATIN6", new String[0]); encodings.put("LATIN8", new String[0]); encodings.put("LATIN10", new String[0]); } private final String encoding; private final boolean fastASCIINumbers; /** * Uses the default charset of the JVM. */ private Encoding() { this(Charset.defaultCharset().name()); } /** * Use the charset passed as parameter. * * @param encoding charset name to use */ protected Encoding(String encoding) { if (encoding == null) { throw new NullPointerException("Null encoding charset not supported"); } this.encoding = encoding; fastASCIINumbers = testAsciiNumbers(); if (LOGGER.isLoggable(Level.FINEST)) { LOGGER.log(Level.FINEST, "Creating new Encoding {0} with fastASCIINumbers {1}", new Object[]{encoding, fastASCIINumbers}); } } /** * Returns true if this encoding has characters '-' and '0'..'9' in exactly same posision as * ascii. * * @return true if the bytes can be scanned directly for ascii numbers. */ public boolean hasAsciiNumbers() { return fastASCIINumbers; } /** * Construct an Encoding for a given JVM encoding. * * @param jvmEncoding the name of the JVM encoding * @return an Encoding instance for the specified encoding, or an Encoding instance for the * default JVM encoding if the specified encoding is unavailable. */ public static Encoding getJVMEncoding(String jvmEncoding) { if ("UTF-8".equals(jvmEncoding)) { return new UTF8Encoding(jvmEncoding); } if (Charset.isSupported(jvmEncoding)) { return new Encoding(jvmEncoding); } else { return DEFAULT_ENCODING; } } /** * Construct an Encoding for a given database encoding. * * @param databaseEncoding the name of the database encoding * @return an Encoding instance for the specified encoding, or an Encoding instance for the * default JVM encoding if the specified encoding is unavailable. */ public static Encoding getDatabaseEncoding(String databaseEncoding) { if ("UTF8".equals(databaseEncoding)) { return UTF8_ENCODING; } // If the backend encoding is known and there is a suitable // encoding in the JVM we use that. Otherwise we fall back // to the default encoding of the JVM. String[] candidates = encodings.get(databaseEncoding); if (candidates != null) { for (String candidate : candidates) { LOGGER.log(Level.FINEST, "Search encoding candidate {0}", candidate); if (Charset.isSupported(candidate)) { return new Encoding(candidate); } } } // Try the encoding name directly -- maybe the charset has been // provided by the user. if (Charset.isSupported(databaseEncoding)) { return new Encoding(databaseEncoding); } // Fall back to default JVM encoding. LOGGER.log(Level.FINEST, "{0} encoding not found, returning default encoding", databaseEncoding); return DEFAULT_ENCODING; } /** * Get the name of the (JVM) encoding used. * * @return the JVM encoding name used by this instance. */ public String name() { return Charset.isSupported(encoding) ? Charset.forName(encoding).name() : encoding; } /** * Encode a string to an array of bytes. * * @param s the string to encode * @return a bytearray containing the encoded string * @throws IOException if something goes wrong */ public byte[] encode(String s) throws IOException { if (s == null) { return null; } return s.getBytes(encoding); } /** * Decode an array of bytes into a string. * * @param encodedString a byte array containing the string to decode * @param offset the offset in <code>encodedString</code> of the first byte of the encoded * representation * @param length the length, in bytes, of the encoded representation * @return the decoded string * @throws IOException if something goes wrong */ public String decode(byte[] encodedString, int offset, int length) throws IOException { return new String(encodedString, offset, length, encoding); } /** * Decode an array of bytes into a string. * * @param encodedString a byte array containing the string to decode * @return the decoded string * @throws IOException if something goes wrong */ public String decode(byte[] encodedString) throws IOException { return decode(encodedString, 0, encodedString.length); } /** * Get a Reader that decodes the given InputStream using this encoding. * * @param in the underlying stream to decode from * @return a non-null Reader implementation. * @throws IOException if something goes wrong */ public Reader getDecodingReader(InputStream in) throws IOException { return new InputStreamReader(in, encoding); } /** * Get a Writer that encodes to the given OutputStream using this encoding. * * @param out the underlying stream to encode to * @return a non-null Writer implementation. * @throws IOException if something goes wrong */ public Writer getEncodingWriter(OutputStream out) throws IOException { return new OutputStreamWriter(out, encoding); } /** * Get an Encoding using the default encoding for the JVM. * * @return an Encoding instance */ public static Encoding defaultEncoding() { return DEFAULT_ENCODING; } public String toString() { return encoding; } /** * Checks weather this encoding is compatible with ASCII for the number characters '-' and * '0'..'9'. Where compatible means that they are encoded with exactly same values. * * @return If faster ASCII number parsing can be used with this encoding. */ private boolean testAsciiNumbers() { // TODO: test all postgres supported encoding to see if there are // any which do _not_ have ascii numbers in same location // at least all the encoding listed in the encodings hashmap have // working ascii numbers try { String test = "-0123456789"; byte[] bytes = encode(test); String res = new String(bytes, "US-ASCII"); return test.equals(res); } catch (java.io.UnsupportedEncodingException e) { return false; } catch (IOException e) { return false; } } }