/*******************************************************************************
* Trombone is a flexible text processing and analysis library used
* primarily by Voyant Tools (voyant-tools.org).
*
* Copyright (©) 2007-2012 Stéfan Sinclair & Geoffrey Rockwell
*
* This file is part of Trombone.
*
* Trombone is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Trombone is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Trombone. If not, see <http://www.gnu.org/licenses/>.
******************************************************************************/
package org.voyanttools.trombone.tool.build;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collections;
import java.util.List;
import org.voyanttools.trombone.input.extract.StoredDocumentSourceExtractor;
import org.voyanttools.trombone.model.DocumentMetadata;
import org.voyanttools.trombone.model.StoredDocumentSource;
import org.voyanttools.trombone.storage.Storage;
import org.voyanttools.trombone.storage.StoredDocumentSourceStorage;
import org.voyanttools.trombone.tool.utils.AbstractTool;
import org.voyanttools.trombone.util.FlexibleParameters;
import com.thoughtworks.xstream.annotations.XStreamAlias;
import com.thoughtworks.xstream.annotations.XStreamOmitField;
/**
 * Tool that runs a list of stored document sources through a
 * {@link StoredDocumentSourceExtractor} and stores the IDs of the resulting
 * (extracted) document sources as a list of strings in {@link Storage}.
 *
 * <p>After {@link #run()} or {@link #run(List)} completes, the extracted
 * sources are available from {@link #getStoredDocumentSources()} and the ID
 * under which the list of extracted document IDs was stored is available
 * from {@link #getStoredId()}.</p>
 *
 * @author sgs
 */
@XStreamAlias("extractedStoredDocuments")
class DocumentExtractor extends AbstractTool {

	/** Value of the "inputFormat" parameter that selects the special-case extractor configuration. */
	private static final String PBLIT_INPUT_FORMAT = "PBLIT";

	/** ID returned by storage for the stored list of extracted document IDs; {@code null} until a run completes. */
	private String storedId = null;

	/** Extracted document sources from the most recent run; omitted from XStream output. */
	@XStreamOmitField
	private List<StoredDocumentSource> storedDocumentSources = new ArrayList<StoredDocumentSource>();

	/**
	 * Creates an extractor tool bound to the given storage and parameters.
	 *
	 * @param storage the storage passed on to {@link AbstractTool}
	 * @param parameters the tool parameters passed on to {@link AbstractTool}
	 */
	DocumentExtractor(Storage storage, FlexibleParameters parameters) {
		super(storage, parameters);
	}

	/**
	 * Reads the "storedId" parameter, retrieves the list of document IDs stored
	 * under it, builds a {@link StoredDocumentSource} for each ID from its stored
	 * metadata, and hands the resulting list to {@link #run(List)}.
	 *
	 * @throws IOException if reading from storage or the extraction itself fails
	 */
	@Override
	public void run() throws IOException {
		String sid = parameters.getParameterValue("storedId");
		List<String> ids = storage.retrieveStrings(sid, Storage.Location.object);
		StoredDocumentSourceStorage storedDocumentStorage = storage.getStoredDocumentSourceStorage();
		List<StoredDocumentSource> extractableStoredDocumentSources = new ArrayList<StoredDocumentSource>(ids.size());
		for (String id : ids) {
			DocumentMetadata metadata = storedDocumentStorage.getStoredDocumentSourceMetadata(id);
			extractableStoredDocumentSources.add(new StoredDocumentSource(id, metadata));
		}
		run(extractableStoredDocumentSources);
	}

	/**
	 * Extracts the given document sources, optionally sorts the result, and
	 * stores the list of extracted document IDs.
	 *
	 * <p>The extracted sources are sorted only when a "sort" parameter is
	 * present, using the comparator supplied by
	 * {@link StoredDocumentSource#getComparator(FlexibleParameters)}.</p>
	 *
	 * @param extractableStoredDocumentSources the document sources to extract
	 * @throws IOException if extraction or storing the resulting IDs fails
	 */
	void run(List<StoredDocumentSource> extractableStoredDocumentSources) throws IOException {
		Calendar start = Calendar.getInstance();
		log("Starting document extraction.");
		StoredDocumentSourceStorage storedDocumentStorage = storage.getStoredDocumentSourceStorage();

		// For the PBLIT input format the extractor gets an empty parameter set
		// instead of this tool's parameters.
		// NOTE(review): presumably so the tool's extraction options are not
		// applied to the PBLIT documents - confirm against the extractor.
		FlexibleParameters extractorParameters = isPblitInputFormat() ? new FlexibleParameters() : parameters;
		StoredDocumentSourceExtractor extractor = new StoredDocumentSourceExtractor(storedDocumentStorage, extractorParameters);
		storedDocumentSources = extractor.getExtractedStoredDocumentSources(extractableStoredDocumentSources);

		// sort documents if needed
		if (parameters.containsKey("sort")) {
			Collections.sort(storedDocumentSources, StoredDocumentSource.getComparator(parameters));
		}

		List<String> extractedIds = new ArrayList<String>(storedDocumentSources.size());
		for (StoredDocumentSource storedDocumentSource : storedDocumentSources) {
			extractedIds.add(storedDocumentSource.getId());
		}
		storedId = storage.storeStrings(extractedIds, Storage.Location.object);
		log("Finished extraction of "+extractedIds.size()+" documents.", start);
	}

	/**
	 * Tells whether the "inputFormat" parameter is present and equals "PBLIT",
	 * ignoring case.
	 *
	 * <p>The comparison uses {@link String#equalsIgnoreCase(String)} rather than
	 * {@code toUpperCase().equals(...)}: the no-argument {@code toUpperCase()}
	 * depends on the default locale, so under a Turkish locale "pblit" would
	 * become "PBL\u0130T" and never match. Calling it on the constant also
	 * tolerates a {@code null} parameter value.</p>
	 *
	 * @return {@code true} if the PBLIT input format was requested
	 */
	private boolean isPblitInputFormat() {
		return parameters.containsKey("inputFormat")
				&& PBLIT_INPUT_FORMAT.equalsIgnoreCase(parameters.getParameterValue("inputFormat"));
	}

	/**
	 * Returns the extracted document sources from the most recent run.
	 *
	 * @return the internal list itself (not a copy); empty if no run has completed
	 */
	List<StoredDocumentSource> getStoredDocumentSources() {
		return storedDocumentSources;
	}

	/**
	 * Returns the ID under which the list of extracted document IDs was stored.
	 *
	 * @return the stored ID, or {@code null} if no run has completed
	 */
	String getStoredId() {
		return storedId;
	}
}