package org.voyanttools.trombone.tool.build;
import static org.junit.Assert.*;
import java.io.IOException;
import java.io.InputStream;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.FileUtils;
import org.apache.tika.io.IOUtils;
import org.junit.Test;
import org.voyanttools.trombone.model.StoredDocumentSource;
import org.voyanttools.trombone.storage.Storage;
import org.voyanttools.trombone.tool.build.DocumentExtractor;
import org.voyanttools.trombone.tool.build.DocumentStorer;
import org.voyanttools.trombone.util.FlexibleParameters;
import org.voyanttools.trombone.util.TestHelper;
import com.google.gson.Gson;
import com.google.gson.internal.StringMap;
import com.thoughtworks.xstream.XStream;
import com.thoughtworks.xstream.io.json.JsonHierarchicalStreamDriver;
/**
 * Exercises {@link DocumentExtractor}: a document is stored with
 * {@link DocumentStorer}, extracted, and the result is checked both for its
 * content and for how the extractor serializes to XML and JSON via XStream.
 */
public class DocumentExtractorTest {

	/**
	 * Stores the {@code formats/chars.rtf} test resource, runs the extractor
	 * on the resulting stored id, and verifies that:
	 * <ul>
	 * <li>exactly one stored document source is produced;</li>
	 * <li>its content contains the reference line taken from
	 * {@code formats/chars_utf8.txt};</li>
	 * <li>the XML serialization exposes a {@code storedId} whose stored strings
	 * match the ids of the extracted document sources;</li>
	 * <li>the JSON serialization exposes the same {@code storedId}.</li>
	 * </ul>
	 *
	 * @throws IOException if a test resource or stored stream cannot be read
	 */
	@Test
	public void test() throws IOException {
		FlexibleParameters parameters = new FlexibleParameters(new String[]{"file="+TestHelper.getResource("formats/chars.rtf")});
		Storage storage = TestHelper.getDefaultTestStorage();

		// store
		DocumentStorer storer = new DocumentStorer(storage, parameters);
		storer.run();

		// extract
		parameters.setParameter("storedId", storer.getStoredId());
		DocumentExtractor extractor = new DocumentExtractor(storage, parameters);
		extractor.run();
		List<StoredDocumentSource> storedDocumentSources = extractor.getStoredDocumentSources();

		// ensure we have exactly one document; checked before get(0) below so
		// that an empty result fails with a clear assertion message
		assertEquals(1, storedDocumentSources.size());

		// make sure we have some plausible content; the reference file is read
		// explicitly as UTF-8 so the comparison does not depend on the platform
		// default charset
		String line = FileUtils.readLines(TestHelper.getResource("formats/chars_utf8.txt"), "UTF-8").get(0).trim();
		// skip anything before the first "I" in the reference line
		// NOTE(review): presumably this drops a leading byte-order mark - confirm
		int start = line.indexOf("I");
		assertTrue("reference line should contain \"I\"", start >= 0);
		line = line.substring(start);

		String original;
		InputStream is = null;
		try {
			String id = storedDocumentSources.get(0).getId();
			is = storage.getStoredDocumentSourceStorage().getStoredDocumentSourceInputStream(id);
			// NOTE(review): decoding here relies on the default used by
			// IOUtils.toString(InputStream) - confirm it matches the stored encoding
			original = IOUtils.toString(is);
		}
		finally {
			if (is!=null) is.close();
		}
		assertTrue(original.contains(line));

		XStream xstream;

		// serialize to XML
		xstream = new XStream();
		xstream.autodetectAnnotations(true);
		String xml = xstream.toXML(extractor);
		assertTrue(xml.startsWith("<extractedStoredDocuments>"));
		Matcher matcher = Pattern.compile("<storedId>(.+?)</storedId>").matcher(xml);
		assertTrue(matcher.find()); // we should match
		String id = matcher.group(1);
		List<String> ids = storage.retrieveStrings(id, Storage.Location.object);
		// compare sizes first, otherwise an empty id list would make the loop
		// below pass without checking anything
		assertEquals(storedDocumentSources.size(), ids.size());
		for (int i=0, len=ids.size(); i<len; i++) {
			assertEquals(ids.get(i),storedDocumentSources.get(i).getId());
		}

		// serialize to JSON
		xstream = new XStream(new JsonHierarchicalStreamDriver());
		xstream.autodetectAnnotations(true);
		String json = xstream.toXML(extractor);
		Gson gson = new Gson();
		StringMap<StringMap> obj = gson.fromJson(json, StringMap.class);
		StringMap<String> sd = obj.get("extractedStoredDocuments");
		assertNotNull("JSON should contain extractedStoredDocuments", sd);
		String idString = (String) sd.get("storedId");
		// the JSON form must expose the same stored id as the XML form
		assertEquals(id, idString);
	}
}