package de.dpa.oss.common; import de.dpa.oss.metadata.mapper.imaging.EncodingCharset; import java.nio.CharBuffer; import java.nio.charset.Charset; import java.nio.charset.CharsetEncoder; import java.util.Arrays; import java.util.Formatter; import java.util.HashMap; import java.util.Map; /** * <p>This class maps all characters of a given input string to an output string. The mapping is based on code points * allowing up to 4-byte unicodes.</p> * <p/> * <p>The class supports the restriction of the mapping to a certain character set. If such a character set is specified * ({@link StringCharacterMappingTableBuilder#restrictToCharsetUsingDefaultChar(EncodingCharset, String)}) then the fallback replacement character * hs to be specified too. This character is used in case an input character cannot be mapped or is malformed </p> * * @author oliver langer */ public class StringCharacterMappingTable implements StringCharacterMapping { /** * first simple approach: * <ul> * <li>use the code point of the character which should be mapped</li> * <li>if the element equals -1 at this index then the character will be returned unchanged. * Otherwise the code point will be used defined at this index</li> * </ul> */ private int codePointMapping[]; private static int NO_MAPPING_ENTRY = -1; private Charset targetCharset; /** * characters of this string will be used during mapping in case a character cannot be mapped * or is somehow malformed. */ private String targetCharsetMappingFallbackAppendString = "?"; public static StringCharacterMappingTableBuilder aCharacterMapping() { return new StringCharacterMappingTableBuilder(); } private StringCharacterMappingTable(final int[] codePointMapping, final Charset targetCharset, final String targetCharsetMappingFallbackString) { this.codePointMapping = codePointMapping; this.targetCharset = targetCharset; if (targetCharset != null) { this.targetCharsetMappingFallbackAppendString = targetCharsetMappingFallbackString; } } @Override public String map(final String inputString) { if (inputString == null || inputString.length() == 0) { return inputString; } final CharsetEncoder charsetEncoder; if (targetCharset != null) { charsetEncoder = targetCharset.newEncoder(); } else { charsetEncoder = Charset.defaultCharset().newEncoder(); } final StringBuilder utf16StringRepresentation = new StringBuilder(); CodepointIterator codepointIterator = CodepointIterator.iterate(inputString); while (codepointIterator.hasNext()) { int currentCodePoint = codepointIterator.next(); int mappedCodepointValue = codePointMapping[currentCodePoint]; if (mappedCodepointValue == NO_MAPPING_ENTRY) { if (targetCharset != null) { /** * try to map it to target charset. If not possible then use fallback mapping character */ CharBuffer utf16CharBuffer = CharBuffer.wrap(Character.toChars(currentCodePoint)); if (charsetEncoder.canEncode(utf16CharBuffer)) { utf16StringRepresentation.append(utf16CharBuffer); } else { utf16StringRepresentation.append(targetCharsetMappingFallbackAppendString); } } else { CharBuffer utf16CharBuffer = CharBuffer.wrap(Character.toChars(currentCodePoint)); if (charsetEncoder.canEncode(utf16CharBuffer)) { utf16StringRepresentation.append(utf16CharBuffer); } else { utf16StringRepresentation.append(targetCharsetMappingFallbackAppendString); } } } else { CharBuffer utf16CharBuffer = CharBuffer.wrap(Character.toChars(mappedCodepointValue)); if (charsetEncoder.canEncode(utf16CharBuffer)) { utf16StringRepresentation.append(utf16CharBuffer); } else { utf16StringRepresentation.append(targetCharsetMappingFallbackAppendString); } } } return utf16StringRepresentation.toString(); } /** * Returns the mapping table as specially formatted string * * @param formatString corresponding to {@link Formatter} with the following arguments: * <ul> * <li> * 1: unicode value of the source character * </li> * <li> * 2: source character * </li> * <li> * 3: unicode value of the mapped character * </li> * <li> * 4: mapped character * </li> * </ul> * @param codepointToAlternativeOutput if the character of codepoint is not suitable for the output an alternative character/string * may be used given by this map. * @return table of mapping */ public String toString(final String formatString, final Map<Integer,String> codepointToAlternativeOutput ) { StringBuilder sb = new StringBuilder(); final Formatter formatter = new Formatter(sb); for (int i = 0; i < Character.MAX_CODE_POINT; i++) { int mapTo = codePointMapping[i]; if (mapTo != NO_MAPPING_ENTRY) { final String sourceString; if( codepointToAlternativeOutput.containsKey(i)) { sourceString = codepointToAlternativeOutput.get(i); } else { sourceString = new String(Character.toChars(i)); } final String targetString; if(codepointToAlternativeOutput.containsKey(mapTo)) { targetString = codepointToAlternativeOutput.get(mapTo); } else { targetString = new String(Character.toChars(mapTo)); } formatter.format(formatString, Integer.toHexString(i), sourceString, Integer.toHexString(mapTo), targetString); } } return sb.toString(); } @Override public String toString() { StringBuilder sb = new StringBuilder(); sb.append("<characterMapping>"); HashMap<Integer, String> codepointToAlternativeOutput = new HashMap<>(); codepointToAlternativeOutput.put( "\"".codePointAt(0), """); sb.append(toString("<character from=\"0x%1s\" to=\"ox%3s\" comment=\"from %2s; to%4s\"/>", codepointToAlternativeOutput)); sb.append( "</characterMapping>"); return sb.toString(); } public static class StringCharacterMappingTableBuilder { private int codePointMapping[]; private Charset targetCharset = null; private String targetCharsetMappingFallbackCharacter = null; public StringCharacterMappingTableBuilder() { codePointMapping = new int[Character.MAX_CODE_POINT]; Arrays.fill(codePointMapping, NO_MAPPING_ENTRY); } public StringCharacterMappingTableBuilder addCodepointMapping(final String fromHex16BitCodepoint, final String toHex16BitCodepoint) { int fromCodepoint = hexToInt(fromHex16BitCodepoint); int toCodepoint = hexToInt(toHex16BitCodepoint); addCodepointMapping(fromCodepoint, toCodepoint); return this; } public StringCharacterMappingTableBuilder addCodepointMapping(final int fromCodepoint, final int toCodePoint) { codePointMapping[fromCodepoint] = toCodePoint; return this; } /** * Iterates over the source and target string simultaneously. For each code point of the source * string it adds a mapping to the corresponding code point of the target string. It is a 1:1 mapping * and therefor expects equal numbers of code points for source and target string. */ public StringCharacterMappingTableBuilder addMultiCharacterMapping(final String sourceCharacters, final String targetCharacters) { CodepointIterator cpSrc = CodepointIterator.iterate(sourceCharacters); CodepointIterator cpDest = CodepointIterator.iterate(targetCharacters); while (cpSrc.hasNext()) { addCodepointMapping(cpSrc.next(), cpDest.next()); } return this; } /** * * * @param charsetName name of the target charset * @param targetCharsetMappingFallbackCharacter during mapping of a character this string will be used instead of * the character in case it can not be encoded into the target charset encoding. */ public StringCharacterMappingTableBuilder restrictToCharsetUsingDefaultChar(final EncodingCharset charsetName, String targetCharsetMappingFallbackCharacter) { if (targetCharsetMappingFallbackCharacter == null) { throw new IllegalArgumentException("fallback mapping character must not be null"); } targetCharset = Charset.forName(charsetName.charsetName()); this.targetCharsetMappingFallbackCharacter = targetCharsetMappingFallbackCharacter; return this; } public StringCharacterMappingTable build() { return new StringCharacterMappingTable(codePointMapping, targetCharset, targetCharsetMappingFallbackCharacter); } } public static int hexToInt(final String hexStr) { if (hexStr == null || hexStr.length() == 0) { throw new IllegalArgumentException(); } int posX = hexStr.indexOf('x'); final String stringToParse; if (posX > -1) { stringToParse = hexStr.substring(posX + 1); } else { stringToParse = hexStr; } return Integer.parseInt(stringToParse, 16); } }