/* * To change this template, choose Tools | Templates * and open the template in the editor. */ package postaggersalanguage.five; import java.io.BufferedReader; import java.io.DataInputStream; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStreamReader; import java.net.URL; import java.util.Collection; import static net.sf.hfst.HfstOptimizedLookup.runTransducer; import net.sf.hfst.FormatException; import net.sf.hfst.NoTokenizationException; import net.sf.hfst.Transducer; import net.sf.hfst.TransducerAlphabet; import net.sf.hfst.TransducerHeader; import net.sf.hfst.UnweightedTransducer; import net.sf.hfst.WeightedTransducer; /** * * @author ahmetaker */ public class Lemmatizer { static Transducer transducer = null; public final static long TRANSITION_TARGET_TABLE_START = 2147483648l; // 2^31 or UINT_MAX/2 rounded up public final static long NO_TABLE_INDEX = 4294967295l; public final static float INFINITE_WEIGHT = (float) 4294967295l; // this is hopefully the same as // static_cast<float>(UINT_MAX) in C++ public final static int NO_SYMBOL_NUMBER = 65535; // this is USHRT_MAX public static enum FlagDiacriticOperator { P, N, R, D, C, U }; public static String getLemma(String aResourceFolder, String aWord, String aLang, String aPOSType) throws IOException, NoTokenizationException, FormatException { if (transducer == null) { FileInputStream transducerfile = null; transducerfile = new FileInputStream(aResourceFolder + "/lemmaModels/" + aLang + ".hfst.ol"); TransducerHeader h = new TransducerHeader(transducerfile); DataInputStream charstream = new DataInputStream(transducerfile); TransducerAlphabet a = new TransducerAlphabet(charstream, h.getSymbolCount()); if (h.isWeighted()) { transducer = new WeightedTransducer(transducerfile, h, a); } else { transducer = new UnweightedTransducer(transducerfile, h, a); } } Collection<String> analyses = transducer.analyze(aWord); for (String analysis : analyses) { if ("en".equalsIgnoreCase(aLang)) { String grammar = "NONE"; String grammarCheck = "NONE"; if ("NOUN".equalsIgnoreCase(aPOSType)) { grammar = "\\[N\\]\\+N.*"; grammarCheck = "[N]+N"; } else if ("VERB".equalsIgnoreCase(aPOSType)) { grammar = "\\[V\\]\\+V.*"; grammarCheck = "[V]+V"; } else if ("ADJ".equalsIgnoreCase(aPOSType)) { grammar = "\\[ADJ\\]\\+ADJ.*"; grammarCheck = "[ADJ]+ADJ"; } else if ("ADV".equalsIgnoreCase(aPOSType)) { grammar = "\\[ADV\\]\\+ADV.*"; grammarCheck = "[ADV]+ADV"; } //System.out.println(analysis); if (analysis.contains(grammarCheck)) { String lemma = analysis.replaceAll(grammar, ""); if ((lemma.contains("+") && !lemma.contains("-")) && (aWord.contains("-") && !aWord.contains("+"))) { lemma = lemma.replaceAll("\\+", "-"); } if (lemma.contains("+") && !aWord.contains("+")) { lemma = lemma.replaceAll("\\+", ""); } return lemma.toLowerCase(); } } else if ("de".equalsIgnoreCase(aLang)) { String grammar = "NONE"; String grammar2 = ">"; String grammarCheck = "NONE"; if ("NOUN".equalsIgnoreCase(aPOSType)) { grammar = "<\\+NN>.*"; grammarCheck = "<+NN>"; } else if ("VERB".equalsIgnoreCase(aPOSType)) { grammar = "<\\+V>.*"; grammarCheck = "<+V>"; } else if ("ADJ".equalsIgnoreCase(aPOSType)) { grammar = "<\\+ADJ>.*"; grammarCheck = "<+ADJ>"; } else if ("ADV".equalsIgnoreCase(aPOSType)) { grammar = "<\\+ADV>.*"; grammarCheck = "<+ADV>"; } else if ("CONJ".equalsIgnoreCase(aPOSType)) { grammar = "<\\+KONJ>.*"; grammarCheck = "<+KONJ>"; } //System.out.println(analysis); if (analysis.contains(grammarCheck)) { String remaining = analysis.replaceAll(grammar, ""); String vals[] = remaining.split(grammar2); StringBuffer buffer = new StringBuffer(); String suffix = ""; for (int i = 0; i < vals.length - 1; i++) { String val = vals[i]; //System.out.println(val); if (!val.startsWith("<CAP")) { val = val.replaceAll("<.*", ""); buffer.append(val.toLowerCase()); } } String lastWord = vals[vals.length - 1].toString().replaceAll("<.*", ""); if (lastWord.endsWith("<SUFF")) { suffix = lastWord.toLowerCase(); } String result = null; // if (aWord.toLowerCase().startsWith(buffer.toString() + "s") && !buffer.toString().trim().equals("") && !secondWord.startsWith("s")) { // result = buffer.append("s").append(vals[vals.length - 1].toLowerCase()).toString().replaceAll("<.*", ""); // } else if (aWord.toLowerCase().equals(buffer.toString())) { return aWord.toLowerCase(); } else { String lastChar = lastWord.substring(lastWord.length()-1, lastWord.length()); String local = buffer.toString() + lastChar; //System.out.println(local); if (local.equalsIgnoreCase(aWord)) { return local; } String last2Char = lastWord.substring(lastWord.length()-2, lastWord.length()); local = buffer.toString() + last2Char; //System.out.println(local); if (local.equalsIgnoreCase(aWord)) { return local; } } if (aWord.toLowerCase().startsWith(buffer.toString()) && !buffer.toString().trim().equals("")) { String wordRemaining = aWord.toLowerCase().replaceAll(buffer.toString(), ""); wordRemaining = wordRemaining.replaceAll(lastWord.toLowerCase(), ""); if (!wordRemaining.trim().equals("") && wordRemaining.trim().length() <= 2) { if (!suffix.equals("")) { result = buffer.append(wordRemaining).toString(); } else { String local = buffer.toString() + lastWord.toLowerCase().toString(); if (aWord.toLowerCase().startsWith(local)) { result = local; } else { //System.out.println("hep " + aWord + " _ " + buffer.toString() + " _ " + vals[vals.length - 1].toLowerCase().toString().replaceAll("<.*", "") + " _ " + wordRemaining); result = buffer.append(wordRemaining).append(lastWord.toLowerCase()).toString(); } } } else { result = buffer.append(lastWord.toLowerCase()).toString(); } } else if (buffer.toString().trim().equals("")) { result = buffer.append(vals[vals.length - 1].toLowerCase()).toString().replaceAll("<.*", ""); } if (result != null) { result = result.replaceAll("\\{", "").replaceAll("\\}", ""); } return result; } } else if ("it".equalsIgnoreCase(aLang)) { String grammar = "NONE"; String grammarCheck = "NONE"; if ("NOUN".equalsIgnoreCase(aPOSType)) { grammar = "#NOUN.*"; grammarCheck = "#NOUN"; } else if ("VERB".equalsIgnoreCase(aPOSType)) { grammar = "#VER.*"; grammarCheck = "#VER"; } else if ("ADJ".equalsIgnoreCase(aPOSType)) { grammar = "#ADJ.*"; grammarCheck = "#ADJ"; } else if ("ADV".equalsIgnoreCase(aPOSType)) { grammar = "#ADV.*"; grammarCheck = "#ADV"; } else if ("CONJ".equalsIgnoreCase(aPOSType)) { grammar = "#CON.*"; grammarCheck = "#CON"; } //System.out.println(analysis); if (analysis.contains(grammarCheck)) { String lemma = analysis.replaceAll(grammar, ""); if ((lemma.contains("+") && !lemma.contains("-")) && (aWord.contains("-") && !aWord.contains("+"))) { lemma = lemma.replaceAll("\\+", "-"); } if (lemma.contains("+") && !aWord.contains("+")) { lemma = lemma.replaceAll("\\+", ""); } return lemma.toLowerCase(); } } else if ("fr".equalsIgnoreCase(aLang)) { String grammar = "NONE"; String grammarCheck = "NONE"; if ("NOUN".equalsIgnoreCase(aPOSType)) { grammar = "\\+commonNoun.*"; grammarCheck = "+commonNoun"; } else if ("VERB".equalsIgnoreCase(aPOSType)) { grammar = "\\+verb+.*"; grammarCheck = "+verb+"; } else if ("ADJ".equalsIgnoreCase(aPOSType)) { grammar = "\\+adjective.*"; grammarCheck = "+adjective"; } else if ("ADV".equalsIgnoreCase(aPOSType)) { grammar = "\\+adverb.*"; grammarCheck = "+adverb"; } else if ("PRON".equalsIgnoreCase(aPOSType) || "CONJ".equalsIgnoreCase(aPOSType)) { grammar = "\\+functionWord.*"; grammarCheck = "+functionWord"; } //System.out.println(analysis); if (analysis.contains(grammarCheck)) { String lemma = analysis.replaceAll(grammar, ""); if ((lemma.contains("+") && !lemma.contains("-")) && (aWord.contains("-") && !aWord.contains("+"))) { lemma = lemma.replaceAll("\\+", "-"); } if (lemma.contains("+") && !aWord.contains("+")) { lemma = lemma.replaceAll("\\+", ""); } return lemma.toLowerCase(); } } } if (analyses.isEmpty()) { return null; } return null; } public static void main(String args[]) throws IOException, NoTokenizationException, FormatException { URL url = Lemmatizer.class.getResource(""); String lemma = Lemmatizer.getLemma(url.getFile(), "hochenergetischen", "de", "adj"); //lemma = "M" + lemma.substring(1); System.out.println(lemma); } }