/******************************************************************************* * Trombone is a flexible text processing and analysis library used * primarily by Voyant Tools (voyant-tools.org). * * Copyright (©) 2007-2012 Stéfan Sinclair & Geoffrey Rockwell * * This file is part of Trombone. * * Trombone is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Trombone is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Trombone. If not, see <http://www.gnu.org/licenses/>. ******************************************************************************/ package org.voyanttools.trombone.model; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.net.URI; import java.net.URISyntaxException; import java.text.Normalizer; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.Comparator; import java.util.LinkedHashSet; import java.util.List; import java.util.Set; import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; import org.voyanttools.trombone.input.source.InputSource; import org.voyanttools.trombone.input.source.UriInputSource; import org.voyanttools.trombone.storage.Storage; import org.voyanttools.trombone.storage.StoredDocumentSourceStorage; import org.voyanttools.trombone.storage.file.FileMigrationFactory; import org.voyanttools.trombone.storage.file.FileStorage; import com.thoughtworks.xstream.annotations.XStreamAlias; import edu.stanford.nlp.util.StringUtils; /** * @author sgs * */ @XStreamAlias("keywords") public class Keywords { private static String COMMA_SEPARATOR = ","; private static String HTTP_PREFIX = "http:"; private static String HTTPS_PREFIX = "https:"; private static String STOPWORDS_FILE_PREFIX = "stop."; private static String KEYWORDS_PREFIX = "keywords-"; private static String COMMENT = "#"; private Set<String> keywords; /** * */ public Keywords() { keywords = new LinkedHashSet<String>(); } public boolean isKeyword(String keyword) { return keywords.contains(keyword); } public boolean isEmpty() { return keywords.isEmpty(); } public void load(Storage storage, String[] references) throws IOException { for (String ref : references) { ref = ref.trim(); if (ref.contains(",")) { // comma-separated references load(storage, ref.split(COMMA_SEPARATOR)); } else if (ref.startsWith(HTTP_PREFIX) || ref.startsWith(HTTPS_PREFIX)) { StoredDocumentSourceStorage storedDocumentSourceStorage = storage.getStoredDocumentSourceStorage(); URI uri; try { uri = new URI(ref); } catch (URISyntaxException e) { throw new IOException("Bad URI provided for keywords: "+ref); } InputSource inputSource = new UriInputSource(uri); StoredDocumentSource storedDocumentSource = storedDocumentSourceStorage.getStoredDocumentSource(inputSource); InputStream inputStream = null; try { inputStream = storedDocumentSourceStorage.getStoredDocumentSourceInputStream(storedDocumentSource.getId()); List<String> keys = IOUtils.readLines(inputStream); add(keys); } finally { if (inputStream!=null) { inputStream.close(); } } } else if (ref.startsWith(STOPWORDS_FILE_PREFIX)) { try(InputStream is = getClass().getResourceAsStream("/org/voyanttools/trombone/keywords/"+ref)) { List<String> refs = IOUtils.readLines(is); add(refs); } catch (IOException e) { throw new IOException("Unable to find local stopwords directory", e); } } else if (ref.startsWith(KEYWORDS_PREFIX)) { String refId = ref.substring(KEYWORDS_PREFIX.length()); try { List<String> refs = storage.retrieveStrings(refId, Storage.Location.object); add(refs); } catch (IOException e) { if (storage instanceof FileStorage) { File file = FileMigrationFactory.getStoredObjectFile((FileStorage) storage, refId); if (file!=null) { // add to lower case here, though not sure we want it this universal String contents = FileUtils.readFileToString(file).toLowerCase(); List<String> keywordsList = StringUtils.split(contents, "\n"); storage.storeStrings(keywordsList, refId, Storage.Location.object); add(keywordsList); } else { throw new IOException("Unable to load keyword file: "+ref); } } else { throw new IOException("Unable to load keyword file: "+ref); } } } else { // individual term, so let's add it keywords.add(ref); } } } public void sort() { List<String> strings = new ArrayList<String>(keywords); Collections.sort(strings, new Comparator<String>() { @Override public int compare(String s1, String s2) { return Normalizer.normalize(s1, Normalizer.Form.NFD).compareToIgnoreCase(Normalizer.normalize(s2, Normalizer.Form.NFD)); } }); keywords.clear(); keywords.addAll(strings); } Collection<String> getKeywords() { return keywords; } public void add(Collection<String> keywords) { for (String keyword : keywords) { if (keyword.trim().startsWith(COMMENT)==false) { for (String word : keyword.split(COMMA_SEPARATOR)) this.keywords.add(word.trim()); } } } }