/** * Stemming * Copyright 2012 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany * first published 01.10.2012 on http://yacy.net * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program in the file lgpl21.txt * If not, see <http://www.gnu.org/licenses/>. */ package net.yacy.cora.language.synonyms; import java.io.File; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.concurrent.BlockingQueue; import net.yacy.cora.storage.Files; import net.yacy.cora.util.CommonPattern; import net.yacy.cora.util.ConcurrentLog; /** * Synonym library: reads synonym files and creates a mapping from words to synonyms. * Synonym files must have a list of synonym words in each line of the input file. * The words within one line must be separated by ','. Lines starting with '#' are * comment lines and are ignored. Each line can (but does not need to) have a '{' * at the beginning of the line and '}' at the end (which would be the GSA format). 
*/ public class SynonymLibrary { private final static ConcurrentLog log = new ConcurrentLog(SynonymLibrary.class.getName()); private final static Map<String, List<Set<String>>> lib = new HashMap<String, List<Set<String>>>(); public static void init(final File path) { lib.clear(); if (!path.exists() || !path.isDirectory()) return; final String[] files = path.list(); /* Global map of all known distinct words : thus enable reuse of the same word String instance * appearing multiple times in different synonyms sets */ final Map<String, String> distinctWords = new HashMap<>(); for (final String f: files) { File ff = new File(path, f); String line; try { BlockingQueue<String> list = Files.concurentLineReader(ff); while ((line = list.take()) != Files.POISON_LINE) { line = line.trim(); if (line.length() == 0 || line.charAt(0) == '#') continue; if (line.charAt(line.length() - 1) == '}') line = line.substring(0, line.length() - 1); if (line.charAt(0) == '{') line = line.substring(1); String[] words = CommonPattern.COMMA.split(line); Set<String> synonyms = new HashSet<String>(words.length); Set<String> keys = new HashSet<String>(words.length); for (String word: words) { word = word.trim(); if (word.length() < 2) continue; String lowCaseWord = word.toLowerCase(); String kownWord = distinctWords.get(lowCaseWord); if(kownWord != null) { /* This word is already known : let's use the existing String instance from the synonyms map to gain memory space */ lowCaseWord = kownWord; } else { /* First encounter of this word : let's add it to the global map of known words */ distinctWords.put(lowCaseWord, lowCaseWord); } synonyms.add(lowCaseWord); keys.add(lowCaseWord.substring(0, 2)); } for (String key: keys) { List<Set<String>> symsetlist = lib.get(key); if (symsetlist == null) { symsetlist = new ArrayList<Set<String>>(); lib.put(key, symsetlist); } symsetlist.add(synonyms); } } } catch (final Throwable e) { log.warn("cannot read stemming file " + f, e); } } } public static int size() { 
return lib.size(); } /** * for a given word, return a list of synonym words * @param word * @return a list of synonyms but without the requested word */ public static Set<String> getSynonyms(String word) { if (word == null) return null; word = word.toLowerCase().trim(); if (word.length() < 2) return null; String key = word.substring(0, 2); List<Set<String>> symsetlist = lib.get(key); if (symsetlist == null) return null; for (Set<String> symset: symsetlist) { if (symset.contains(word)) { // create a new set containing all but the one word Set<String> returnSet = new HashSet<String>(); for (String synonym: symset) { if (synonym.equals(word)) continue; returnSet.add(synonym); } return returnSet; } } return null; } }