/******************************************************************************* cam * Trombone is a flexible text processing and analysis library used * primarily by Voyant Tools (voyant-tools.org). * * Copyright (©) 2007-2012 Stéfan Sinclair & Geoffrey Rockwell * * This file is part of Trombone. * * Trombone is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Trombone is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Trombone. If not, see <http://www.gnu.org/licenses/>. ******************************************************************************/ package org.voyanttools.trombone.input.extract; import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.concurrent.Callable; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; import org.apache.commons.io.IOUtils; import org.voyanttools.trombone.input.source.InputSource; import org.voyanttools.trombone.input.source.Source; import org.voyanttools.trombone.model.DocumentFormat; import org.voyanttools.trombone.model.DocumentMetadata; import org.voyanttools.trombone.model.StoredDocumentSource; import org.voyanttools.trombone.storage.StoredDocumentSourceStorage; import org.voyanttools.trombone.util.FlexibleParameters; /** * @author sgs * */ public class StoredDocumentSourceExtractor { /** * all parameters sent, only some of which may be relevant to some expanders */ private FlexibleParameters parameters; /** * the storage strategy to use for storing document sources */ private StoredDocumentSourceStorage storedDocumentSourceStorage; private TikaExtractor tikaExtractor = null; private XmlExtractor xmlExtractor = null; private BagItExtractor bagItExtractor = null; // static { // try { //// DetectorFactory.loadProfiles("af","am","ar","az","be","bg","bn","bo","ca","cs","cy","da","de","dv","el","en","es","et","eu","fa","fi","fo","fr","ga","gn","gu","he","hi","hr","hu","hy","id","is","it","ja","jv","ka","kk","km","kn","ko","ky","lb","lij","ln","lt","lv","mi","mk","ml","mn","mr","mt","my","ne","nl","no","os","pa","pl","pnb","pt","qu","ro","ru","si","sk","so","sq","sr","sv","sw","ta","te","th","tk","tl","tr","tt","ug","uk","ur","uz","vi","yi","yo","zh-cn","zh-tw"); // } catch (LangDetectException e) { // throw new IllegalStateException("Unable to initiate language detection profiles", e); // } // } public StoredDocumentSourceExtractor( StoredDocumentSourceStorage storedDocumentSourceStorage, FlexibleParameters parameters) { this.storedDocumentSourceStorage = storedDocumentSourceStorage; this.parameters = parameters; } public List<StoredDocumentSource> getExtractedStoredDocumentSources(List<StoredDocumentSource> storedDocumentSources) throws IOException { List<StoredDocumentSource> extractedStoredDocumentSources = new ArrayList<StoredDocumentSource>(); int processors = Runtime.getRuntime().availableProcessors(); ExecutorService executor = Executors.newFixedThreadPool(processors); List<Future<StoredDocumentSource>> list = new ArrayList<Future<StoredDocumentSource>>(); boolean verbose = parameters.getParameterBooleanValue("verbose"); for (StoredDocumentSource storedDocumentSource : storedDocumentSources) { Callable<StoredDocumentSource> worker = new CallableExtractor(this, storedDocumentSource, verbose); Future<StoredDocumentSource> submit = executor.submit(worker); list.add(submit); } try { for (Future<StoredDocumentSource> future : list) { extractedStoredDocumentSources.add(future.get()); } } catch (InterruptedException e) { throw new IllegalStateException("An error occurred during multi-threaded document expansion.", e); } catch (ExecutionException e) { throw new IllegalStateException("An error occurred during multi-threaded document expansion.", e); } executor.shutdown(); return extractedStoredDocumentSources; } public StoredDocumentSource getExtractedStoredDocumentSource( StoredDocumentSource storedDocumentSource) throws IOException { DocumentFormat format; String inputFormatString = parameters.getParameterValue("inputFormat", "").toUpperCase(); if (inputFormatString.isEmpty()==false && inputFormatString.toUpperCase().equals("PBLIT")==false) { format = DocumentFormat.getForgivingly(inputFormatString); if (format==DocumentFormat.UNKNOWN) { // allow this to be set, especially for XML with an input format definition format = storedDocumentSource.getMetadata().getDocumentFormat(); } } else { format = storedDocumentSource.getMetadata().getDocumentFormat(); } if (format==DocumentFormat.UNKNOWN && storedDocumentSource.getMetadata().getSource()==Source.STRING) { String string = IOUtils.toString(storedDocumentSourceStorage.getStoredDocumentSourceInputStream(storedDocumentSource.getId())); format = DocumentFormat.fromString(string); if (format != DocumentFormat.UNKNOWN) { DocumentMetadata metadata = storedDocumentSource.getMetadata(); metadata.setDefaultFormat(format); storedDocumentSourceStorage.updateStoredDocumentSourceMetadata(storedDocumentSource.getId(), metadata); } } InputSource extractedInputSource; if (format.isXml()) { if (xmlExtractor==null) {xmlExtractor = new XmlExtractor(storedDocumentSourceStorage, parameters);} extractedInputSource = xmlExtractor.getExtractableInputSource(storedDocumentSource); } else if (format==DocumentFormat.BAGIT) { if (bagItExtractor==null) {bagItExtractor = new BagItExtractor(storedDocumentSourceStorage, parameters);} extractedInputSource = bagItExtractor.getExtractableInputSource(storedDocumentSource); } else { if (tikaExtractor==null) {tikaExtractor = new TikaExtractor(storedDocumentSourceStorage, parameters);} extractedInputSource = tikaExtractor.getExtractableInputSource(storedDocumentSource); } return storedDocumentSourceStorage.getStoredDocumentSource(extractedInputSource); } private class CallableExtractor implements Callable<StoredDocumentSource> { private StoredDocumentSourceExtractor extractor; private StoredDocumentSource storedDocumentSource; private boolean verbose; public CallableExtractor( StoredDocumentSourceExtractor storedDocumentSourceExtractor, StoredDocumentSource storedDocumentSource, boolean verbose) { this.extractor = storedDocumentSourceExtractor; this.storedDocumentSource = storedDocumentSource; this.verbose = verbose; } @Override public StoredDocumentSource call() throws Exception { // if (verbose) {System.out.println("extracting "+storedDocumentSource.getMetadata());} return this.extractor.getExtractedStoredDocumentSource(storedDocumentSource); } } }