package com.ontotext.kim.model; import gate.creole.ResourceInstantiationException; import gate.util.profile.Profiler; import gnu.trove.TIntHashSet; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.ObjectInputStream; import java.io.ObjectOutputStream; import java.rmi.RemoteException; import java.util.*; import org.apache.commons.collections.Transformer; import org.apache.commons.io.FileUtils; import org.apache.log4j.Logger; import org.openrdf.model.URI; import org.openrdf.model.impl.URIImpl; import com.ontotext.kim.KIMConstants; import com.ontotext.kim.client.KIMRuntimeException; import com.ontotext.kim.client.query.KIMQueryException; import com.ontotext.kim.client.semanticrepository.QueryResultListener; import com.ontotext.kim.client.semanticrepository.QueryResultListener.Feed; import com.ontotext.kim.gate.KimLookupParser; import com.ontotext.kim.gate.KimLookupParser.AliasLookupDictionary; import com.ontotext.kim.util.ListReader; import com.ontotext.kim.util.StringTransformations; /** * This class is designed to serve as an Alias dictionary for the * <code>KimGazetteer</code>. It is used to store an image of the textual * representations (Aliases) of the known objects (Entities) described in the * KIM platform knowledge-base. Then it allows to check text fragments for * the presence of known Aliases.<br> * The aliases are not stored as plain text but as couple of hash codes. * The storage is implemented by the couple of classes <code>HashedAlias</code> * and <code>HashRegister</code>. Because of this specific - before storing * of an Alias and also before checking a text fragment - they both must * be pre-processed. The pre-processing is implemented in couple of classes. 
* The class <code>AliasTextTransformer</code> does preliminary text * normalization and the class <code>ParsingFrame</code> does further * normalization and hash-codes calculation.<br> * <br> * An instance of the <code>AliasCacheImpl</code> class is obtained through * a synchronized factory method. * * @author danko * */ public class AliasCacheImpl implements AliasLookupDictionary { protected static Logger log = Logger.getLogger(AliasCacheImpl.class); private static DataFeedFactory feedFactory = new DataFeedFactory(); /** The register containing the <code>HashedAlias</code> instances */ protected HashRegister aliasRegister; /** The set of hash-codes of valid alias prefixes. This set is used in * text parsing and lookup phase. It helps to determine if an attempt * must be made to expand the span of the <code>ParsingFrame</code> used * to parse the searched text. An expansion is made only if currently * framed text generates <b>Alias-Hash-1</b> code which appears in the * <code>aliasPrefixes</code> set. (For details see * <code>ParsingFrame</code>) */ protected TIntHashSet aliasPrefixes; /** Additional register which allows fast checking if a given Entity's * aliases has been stored in the alias register. 
*/ protected HashRegister aliasInstRegister; /** Array used for encoding/decoding the instance URI's name-spaces */ protected ArrayList<String> instNS; /** Array used for encoding/decoding the semantic class URIs */ protected ArrayList<String> classCache; /** Additional register containing exactly appointed aliases, which * must be ignored on storing */ protected HashRegister aliasToIgnore; /** The general case sensitivity selector of the Alias cache */ private String caseSensitivity; protected AliasCacheImpl (String caseSensitive) { this.caseSensitivity = caseSensitive; } //========================================================================= // Alias Cache: Instance Generation section //========================================================================= private static Map<File, LoadedCache> aliasDictionaries = new HashMap<File, LoadedCache>(); private static Object instanceLock = new Object(); public static AliasCacheImpl getInstance() throws ResourceInstantiationException { String caseSens = System.getProperty("com.ontotext.kim.KIMConstants.ENTITY_CASE_TYPE", KIMConstants.CASE_INSENSITIV); return getInstance(new File(KIMConstants.KIM_CACHE_PATH), caseSens, "<unknown>"); } private static class LoadedCache { public AliasCacheImpl cache; public List<String> clients = new LinkedList<String>(); } /** * A static method for generation/access to the one and only instance * of the alias cache * @param dictionaryPath * @param caseSens * @return - the instance of the cache * @throws ResourceInstantiationException */ public static AliasCacheImpl getInstance(File dictionaryPath, String caseSens, String clientId) throws ResourceInstantiationException { synchronized(instanceLock) { if ( !aliasDictionaries.containsKey(dictionaryPath)) { LoadedCache lc = new LoadedCache(); lc.cache = createInstance(dictionaryPath, caseSens); aliasDictionaries.put(dictionaryPath, lc); } } LoadedCache lc = aliasDictionaries.get(dictionaryPath); lc.clients.add(clientId); return lc.cache; 
} public static void releaseCache(File dictionaryPath, String clientId) { synchronized(instanceLock) { LoadedCache lc = aliasDictionaries.get(dictionaryPath); if (lc == null) return; lc.clients.remove(clientId); if (lc.clients.isEmpty()) aliasDictionaries.remove(dictionaryPath); else { log.info("The cache for " + dictionaryPath + " will not be unloaded or reloaded because some clients remain: " + lc.clients); } } } public static AliasCacheImpl createInstance(File dictionaryPath, String caseSens) throws ResourceInstantiationException { AliasCacheImpl aliasCacheInstance = new AliasCacheImpl(caseSens); Feed feed = feedFactory.createFeed(dictionaryPath); Set<String> ignoreList = ListReader.fromFile(KIMConstants.KIM_GAZETEER_IGNORE_LIST_PATH,log); try { aliasCacheInstance.initCache(ignoreList, feed, dictionaryPath); } catch (RemoteException e) { throw new ResourceInstantiationException(e); } return aliasCacheInstance; } //========================================================================= // Alias Cache Persistence section //========================================================================= /** * Checks whether an alias can be added to the cache, honoring the * caseSensitivity setting.<br> * <br> * The ignore list check is performed here.<br> * * @param alias * @return whether the alias can be added */ private boolean verifyAlias(String alias) { if (alias==null || alias.trim().length() == 0) return false; alias = (String) ParsingFrame.frameTT.transform(alias); if (aliasToIgnore.exists(alias.hashCode(), alias)) { log.info("'" + alias + "' ignored, because it was found in the ignore list."); return false; } return true; } /** This class is used in deserialization process to initialize the * <code>aliasInstRegister</code> register */ private static class InstanceRegisterLoader implements HashRegister.ContentProcessor { HashRegister instRegister; public InstanceRegisterLoader(HashRegister register) { instRegister = register; } public void process(Object[] 
elements) { if (elements != null) { for (int i=0; i<elements.length; i++) { String shortInst = ((HashedAlias) elements[i]).shortInstURI; instRegister.add(shortInst.hashCode(), shortInst); } } } } /** This is a blank initialization method. It creates the structure of a new * Alias cache without filling it with content. This code is separated from * the data filling from the semantic repository - for class extension * convenience.<br> * @param ignoreAliases - a String list of aliases to be ignored. */ protected void initBlankCache(Collection<String> ignoreAliases) { aliasRegister = new HashRegister(); aliasPrefixes = new TIntHashSet(); aliasInstRegister = new HashRegister(); instNS = new ArrayList<String>(); classCache = new ArrayList<String>(); // Create a TextTransformer instance for Alias text normalization Transformer tt = new AliasTextTransformer( caseSensitivity.equals(KIMConstants.CASE_INSENSITIV)); ParsingFrame.frameTT = tt; aliasToIgnore = new HashRegister(); if (ignoreAliases != null) { for (String alias : ignoreAliases) { // Apply same text normalization to the aliases to be ignored alias = (String)tt.transform(alias); aliasToIgnore.add(alias.hashCode(), alias); if (caseSensitivity.equals(KIMConstants.CASE_SENSITIV_ALLUPPER)) { alias = alias.toUpperCase(); aliasToIgnore.add(alias.hashCode(), alias); } } } log.info( "Aliases in IGNORE list:" + aliasToIgnore.getElementsCount()); } /** This method implements the default full initialization process. * It creates an empty Alias cache and then fills it with data. The data * is collected either from the semantic repository or from a serialization * source (a file). * @param ignoreAliases - a String list of aliases to be ignored. * @param semRep * @param dictionaryPath * @throws RemoteException - on failure to access the semantic repository. 
*/ protected void initCache( Collection<String> ignoreAliases, QueryResultListener.Feed dataFeed, File dictionaryPath) throws RemoteException { Profiler pro = new Profiler(); pro.enableGCCalling(false); pro.printToSystemOut(true); pro.initRun("Loading of Entities Cache"); pro.checkPoint("start loading"); initBlankCache(ignoreAliases); File fileTCache = new File(dictionaryPath, "kim.trusted.entities.cache").getAbsoluteFile(); File flagTCache = new File(dictionaryPath, fileTCache.getName() + ".flag"); try { ensureCachePath(dictionaryPath); } catch (IOException e1) { log.error( "Could not create entity cache.", e1); } boolean flagTLoaded = false; if (!flagTCache.exists() && fileTCache.exists()) { log.info("Loading of trusted entities from " + fileTCache); try { ObjectInputStream ois = new ObjectInputStream( new FileInputStream(fileTCache)); Object[] res = (Object[]) ois.readObject(); ois.close(); aliasRegister = (HashRegister) res[0]; aliasPrefixes = (TIntHashSet) res[1]; instNS = (ArrayList<String>) res[2]; classCache = (ArrayList<String>) res[3]; aliasInstRegister = new HashRegister(); // The exactly same Entity InstURI strings are reused aliasRegister.processContent( new InstanceRegisterLoader(aliasInstRegister)); flagTLoaded = true; log.info(aliasRegister.getElementsCount() + " elements loaded."); } catch (Exception e) { log.error("Loading from " + fileTCache + " failed. 
" + "Continue with loading from Semantic Repository.", e); } } if (!flagTLoaded) { loadTrustedMaps(dataFeed); try { flagTCache.createNewFile(); if (fileTCache.exists()) fileTCache.delete(); ObjectOutputStream oos = new ObjectOutputStream( new FileOutputStream(fileTCache)); oos.writeObject( new Object[]{aliasRegister, aliasPrefixes, instNS, classCache}); oos.close(); flagTCache.delete(); } catch (Exception ex) { log.error("Saving of trusted entities to " + fileTCache + " failed.", ex); } } log.info("Aliases were loaded"); pro.checkPoint("cache loaded"); } private void ensureCachePath(File cachePath) throws IOException { if (cachePath.exists() && !cachePath.isDirectory()) FileUtils.forceDelete(cachePath); if (!cachePath.exists()) FileUtils.forceMkdir(cachePath); } private void loadTrustedMaps(QueryResultListener.Feed dataFeed) { log.info("Loading of trusted entities from Sesame"); String filePath = System.getProperty("kim.home.dir", ".") + EntityPriority.PRIORITY_CONF_FILE.substring(1); existsClassPriority = (new File(filePath)).exists(); if (existsClassPriority) { try { entPrior = new EntityPriority(); entPrior.init(); existsClassPriority = existsClassPriority && entPrior.getFilterLookups(); } catch (Exception e) { log.error( "Cannot create instance of Priorities class", e); entPrior = null; } } EntitiesQueryListener entityListener = new TrustedEntitiesListener(entPrior); // Handler to preserve the same inner listener for the two queries if ( log.isDebugEnabled() ) { entityListener = StatisticListener.wrap(entityListener, "Thrusted Entities"); } try { // semRep.evaluateSelectSeRQL(, entityListener); dataFeed.feedTo(entityListener); } catch (KIMQueryException e) { throw new KIMRuntimeException("The loading failed.", e); } finally { log.info("The loading from Sesame finished"); } } /** A class extending the <code>EntitiesQueryListener</code>, which is * used to process the input from the semantic repository. It is used * only when the data is loaded from there. 
*/ class TrustedEntitiesListener extends EntitiesQueryListener { private final EntityPriority m_entPrior; TrustedEntitiesListener(EntityPriority m_entPrior) { this.m_entPrior = m_entPrior; } @Override protected void addEntity(String instUri, String classUri, String aliasLabel) { addAlias(instUri, classUri, aliasLabel, true); } @Override public void endTableQueryResult() throws IOException { super.endTableQueryResult(); if (existsClassPriority && allPrioritiesCompetition != null) { Iterator it = allPrioritiesCompetition.keySet().iterator(); while (it.hasNext()) { ArrayList<priorityCompetition> pcList = allPrioritiesCompetition.get(it.next()); int maxPrior = pcList.get(0).maxPriority; int treshold = m_entPrior.getThreshold(); for (int i = 0; i < pcList.size(); i++) { priorityCompetition pc = pcList.get(i); if (i == 0 || maxPrior - pc.maxPriority <= treshold) addAlias(pc.instURI, pc.classURI, pc.alias, false); } } allPrioritiesCompetition = null; } } } //========================================================================= // Alias Cache: Statistics Collection section //========================================================================= /** The class is used to collect timing data for profiling purposes */ public static class Stats { private static boolean doStats = false; private static final String[] statNames = { "AA_PrefixStore", "AA_URIStringReuse", "AA_InstUriRegisterInsert", "AA_AliasRegisterInsert", "AL_PreParsing", "AL_GetByAliasHash1", "AL_FilterByAliasHash2", "PF_MakeFrame", "PF_Find", "PF_RecalcFrame", "PF_MakeFrameSnapshot" }; private static long[] statTimes = new long[statNames.length]; public static void restartStats() { doStats = true; Arrays.fill(statTimes, 0); } public static void stopStats() { doStats = false; } public static boolean doStats() { return doStats; } private static long curr; private static long last; public static void markIt(int index) { if (doStats) { curr = System.currentTimeMillis(); long duration = curr-last; if (index 
>= 0 && index < statTimes.length) statTimes[index] += duration; last = curr; } } public static void dumpStats() { for (int i=0; i<statNames.length; i++) if (statTimes[i] > 0) System.out.println( " " + statNames[i] + " = " + statTimes[i] + "ms."); } } //========================================================================= // Alias Cache: Population section //========================================================================= /** Adds an Alias with its instance and semantic class to the Alias cache. * A single call to this method could result in adding several records to * the alias cache. This is as a result to the standard Alias enrichment * logic which is applied over the given as input alias string. * @param instURI - the URI of the Entity instance corresponding to the * Alias * @param classURI - the URI of the semantic class * @param alias - the string of the alias * @param primaryAccess - processing specific flag; if <b>true</b> - forces * class priority checks. */ public void addAlias(String instURI, String classURI, String alias, boolean primaryAccess) { if (checkClassPriority(instURI, classURI, alias, primaryAccess)) return; String[] enriched = aliasEnrichment(alias); for (int i=0; i< enriched.length; i++) { if (verifyAlias(enriched[i])) simpleAddAlias(instURI, classURI, enriched[i]); } } /** This method performs the standard alias enrichment. It covers * cases as variants with and without trailing punctuation. * @param alias - the original alias string. * @return array of distinct strings which are accepted as equally valid * representation of the related to the Alias - Entity. 
*/ private String[] aliasEnrichment(String alias) { HashSet<String> aliases = new HashSet<String>(); aliases.add(alias); String[] tmp; // Enrich with UPPER case versions if needed if (caseSensitivity.equals(KIMConstants.CASE_SENSITIV_ALLUPPER)) { tmp = aliases.toArray(new String[0]); for (int i=0; i<tmp.length; i++) { String tmpNew = tmp[i].toUpperCase(); if (!tmpNew.equals(tmp[i])) aliases.add(tmpNew); } } // Enrich with versions with stripped punctual suffix tmp = aliases.toArray(new String[0]); for (int i=0; i<tmp.length; i++) { String tmpNew = StringTransformations.stripPunctAtEnd(tmp[i]); if (!tmpNew.equals(tmp[i])) aliases.add(tmpNew); } return aliases.toArray(new String[0]); } /** Method that implements a simple alias addition (as is) to the * cache structures * @param instURI - the Entity instance URI * @param classURI - the semantic class URI * @param alias - the string of the alias */ private void simpleAddAlias(String instURI, String classURI, String alias) { String shortInstURI = packNS(instURI); // Calculate different hash values related with the alias ParsingFrame pfm = new ParsingFrame(alias); do { if (pfm.parseOne() && pfm.frameCanExpand()) { // Add the aliases prefixes to the alias prefix register Stats.markIt(-1); aliasPrefixes.add(pfm.getAliasHash1()); Stats.markIt(0); } } while (pfm.frameCanExpand()); // Add the corresponding instURI in the Entity instance URI register String oldURI = null; // Re-usage of the same string for EntityURI for all its Aliases // is expected to save 10% to 15% of the memory oldURI = (String) aliasInstRegister.get( shortInstURI.hashCode(), shortInstURI); Stats.markIt(1); if (oldURI != null) shortInstURI = oldURI; else aliasInstRegister.add(shortInstURI.hashCode(), shortInstURI); Stats.markIt(2); // Add the alias in the alias register aliasRegister.add( pfm.getAliasHash1(), new HashedAlias(pfm.getAliasHash2(), pfm.getPrefixLen(), pfm.getSuffixLen(), shortInstURI, packClass(classURI))); Stats.markIt(3); } 
//=========================================================================
// Alias Cache: Retrieval section
//=========================================================================

    /**
     * Looks up matches for a given Alias string. This is useful for
     * single lookups and searches for exact matches. This means no
     * prefix/suffix variations are accepted.
     * @param alias - the label string of the alias
     * @return - list of wrappers for the matching aliases; empty when
     * nothing matches
     */
    public ArrayList<KimLookupParser.AliasWrapper> lookup(String alias) {
        ParsingFrame pfm = new ParsingFrame(alias);
        pfm.parseAll();
        return lookup(pfm, true);
    }

    /**
     * Looks up matches for the text currently framed by the given parsing
     * frame. For every candidate the frame's prefix/suffix lengths are set
     * to the ones stored with the candidate before the comparison.
     * @param pfm - the parsing frame positioned over the text
     * @return - collection of wrappers for the matching aliases; empty when
     * nothing matches
     */
    public Collection<KimLookupParser.AliasWrapper> lookup(ParsingFrame pfm) {
        return lookup(pfm, false);
    }

    /**
     * Common lookup implementation: fetches the candidates registered under
     * the frame's Alias-Hash-1 and keeps those whose Alias-Hash-2 equals the
     * one of the frame.
     * @param pfm - the parsing frame
     * @param exactlySame - if false, the frame's prefix/suffix lengths are
     * re-set from each candidate before its hash is compared
     * @return - the matches, never null
     */
    private ArrayList<KimLookupParser.AliasWrapper> lookup(
            ParsingFrame pfm, boolean exactlySame) {
        Stats.markIt(-1);
        ArrayList<KimLookupParser.AliasWrapper> res =
                new ArrayList<KimLookupParser.AliasWrapper>();
        Object[] tmp = aliasRegister.get(pfm.getAliasHash1());
        Stats.markIt(5);
        if (tmp == null || tmp.length == 0) {
            return res;
        }

        for (int i = 0; i < tmp.length; i++) {
            HashedAlias ha = (HashedAlias) tmp[i];
            if (!exactlySame) {
                pfm.setNewPrefSufLen(ha.prefLen, ha.suffLen);
            }
            if (pfm.getAliasHash2() == ha.aliasHash2) {
                res.add(new KimLookupParser.AliasWrapper(
                        unpackNS(ha.shortInstURI),
                        unpackClass(ha.classID),
                        pfm.getAliasStart(),
                        pfm.getAliasEnd()));
            }
        }
        Stats.markIt(6);
        return res;
    }

    /**
     * @param pfm - the parsing frame
     * @return whether the frame's Alias-Hash-1 is a registered alias prefix
     */
    public boolean canPhraseGrow(ParsingFrame pfm) {
        return aliasPrefixes.contains(pfm.getAliasHash1());
    }

    /**
     * Checks whether aliases of the given Entity are stored in the cache.
     * Note: the URI is packed first, which registers a not yet known
     * name-space in the name-space table.
     * @param instURI - the full Entity instance URI
     * @return whether the instance is present in the instance register
     */
    public boolean isTrustedEntityURI(String instURI) {
        String shortURI = packNS(instURI);
        return aliasInstRegister.exists(shortURI.hashCode(), shortURI);
    }

    /** @return the number of elements in the Entity instance register */
    public int getEntityCount() {
        return aliasInstRegister.getElementsCount();
    }

    /** @return the number of elements in the alias register */
    public int getAliasCount() {
        return aliasRegister.getElementsCount();
    }

//=========================================================================
// Alias Cache: URI Pack/Unpack tools
//=========================================================================

    /**
     * Encodes a full URI as <code>&lt;name-space index&gt;:&lt;local
     * name&gt;</code>. A name-space that is not yet known is appended to the
     * name-space table. If the URI cannot be parsed, the empty name-space is
     * used and the whole string is taken as the local name.
     * @param fullURI - the full URI
     * @return the packed (short) URI
     */
    private String packNS(String fullURI) {
        String ns;
        String ln;
        try {
            URI uri = new URIImpl(fullURI);
            ns = uri.getNamespace();
            ln = uri.getLocalName();
        } catch (RuntimeException e) {
            ns = "";
            ln = fullURI;
        }

        int j = instNS.indexOf(ns);
        if (j < 0) {
            instNS.add(ns);
            j = instNS.size() - 1;
        }
        return j + ":" + ln;
    }

    /**
     * Decodes a short URI produced by {@link #packNS(String)}.
     * @param shortURI - the packed URI; may be null
     * @return the full URI, null for null input, or the input itself when
     * it does not start with a valid name-space index
     */
    private String unpackNS(String shortURI) {
        if (shortURI == null) {
            return null;
        }

        int i = shortURI.indexOf(':');
        String ns;
        try {
            ns = instNS.get(Integer.parseInt(shortURI.substring(0, i)));
        } catch (RuntimeException e) {
            // Missing ':' separator, non-numeric index or unknown index
            log.debug("Short URI unpack failed:" + shortURI, e);
            return shortURI;
        }
        return ns + shortURI.substring(i + 1);
    }

    /**
     * Encodes a semantic class URI as its index in the class table,
     * appending it when not yet known.
     * @param classURI - the semantic class URI
     * @return the index of the class URI
     */
    private int packClass(String classURI) {
        int i = classCache.indexOf(classURI);
        if (i < 0) {
            classCache.add(classURI);
            i = classCache.size() - 1;
        }
        return i;
    }

    /**
     * @param clasID - an index returned by {@link #packClass(String)}
     * @return the semantic class URI stored at that index
     */
    private String unpackClass(int clasID) {
        return classCache.get(clasID);
    }

//======================================================
// Alias Cache: Class Priority competition & elimination
//======================================================

    /** One competitor for an alias: the Entity instance, its semantic class
     * and the priority of that class. */
    class priorityCompetition {
        String instURI;
        String classURI;
        String alias;
        int maxPriority;

        priorityCompetition(String instURI, String classURI, String alias,
                int maxPriority) {
            this.instURI = instURI;
            this.classURI = classURI;
            this.alias = alias;
            this.maxPriority = maxPriority;
        }
    }

    /** The competitors collected per alias, each list sorted by descending
     * priority. Set to null once the competition has been resolved. */
    protected HashMap<String, ArrayList<priorityCompetition>> allPrioritiesCompetition =
            new HashMap<String, ArrayList<priorityCompetition>>();

    protected EntityPriority entPrior;

    protected boolean existsClassPriority = false;

    /**
     * Defers the addition of an alias whose semantic class takes part in the
     * class priority competition.
     *
     * <p>
     * If the local name of the class URI has a configured priority, the
     * alias is not added right away. Instead it is put in a Map holding, per
     * alias, a List of all competing classes sorted by descending priority
     * (an entry is inserted before the first entry with a strictly lower
     * priority). Because of the small number of classes per label, a direct
     * linear insert is used rather than a binary search.
     *
     * <p>
     * No deferral happens - and false is returned - when this is not a
     * primary access, when class priorities are not enabled, when no
     * priorities instance is available, or when the competition map has
     * already been released; in the last two cases the alias is accepted
     * directly instead of failing with a NullPointerException.
     *
     * @param instURI - the Entity instance URI
     * @param classURI - the semantic class URI
     * @param alias - the string of the alias
     * @param primaryAccess - whether the priority check applies to this call
     * @return true if the alias was deferred to the competition and must not
     * be added by the caller now
     */
    private boolean checkClassPriority(String instURI, String classURI,
            final String alias, boolean primaryAccess) {
        if (!primaryAccess || !existsClassPriority
                || entPrior == null || allPrioritiesCompetition == null) {
            return false;
        }

        URI origClass = new URIImpl(classURI);
        String priorityClassName = origClass.getLocalName();
        if (!entPrior.m_hClassPrio.containsKey(priorityClassName)) {
            return false;
        }

        int mp = (Integer) entPrior.m_hClassPrio.get(priorityClassName);
        log.info("COMPETITION:" + "\t" + instURI + "\t" + classURI + "\t"
                + alias + "\t" + mp);

        ArrayList<priorityCompetition> pcList =
                allPrioritiesCompetition.get(alias);
        if (pcList == null) {
            pcList = new ArrayList<priorityCompetition>();
            allPrioritiesCompetition.put(alias, pcList);
        }

        // Keep the list sorted by descending priority; equal priorities keep
        // their arrival order.
        int pos = 0;
        while (pos < pcList.size() && mp <= pcList.get(pos).maxPriority) {
            pos++;
        }
        pcList.add(pos, new priorityCompetition(instURI, classURI, alias, mp));
        return true;
    }
}