package io.github.infolis.algorithm;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with this
* work for additional information regarding copyright ownership. The ASF
* licenses this file to You under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
import io.github.infolis.InfolisConfig;
import io.github.infolis.datastore.DataStoreClient;
import io.github.infolis.datastore.FileResolver;
import io.github.infolis.model.Execution;
import io.github.infolis.model.ExecutionStatus;
import io.github.infolis.model.entity.InfolisFile;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

import javax.ws.rs.BadRequestException;
import javax.ws.rs.ProcessingException;

import org.apache.commons.io.FileUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.NIOFSDirectory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Class for adding text files to a Lucene index.
*
* @author kata
* @author kba
*/
public class Indexer extends BaseAlgorithm {

    /** Prefix for temporary index directories created when no index directory is configured. */
    private static final String INDEX_DIR_PREFIX = "infolis-index-";

    private static final Logger log = LoggerFactory.getLogger(Indexer.class);

    public Indexer(DataStoreClient inputDataStoreClient, DataStoreClient outputDataStoreClient,
            FileResolver inputFileResolver, FileResolver outputFileResolver) {
        super(inputDataStoreClient, outputDataStoreClient, inputFileResolver, outputFileResolver);
    }

    /**
     * Creates the analyzer used for indexing: a {@link WhitespaceAnalyzer},
     * so tokens are split on whitespace only, without further normalization.
     *
     * @return a new analyzer instance
     */
    public static Analyzer createAnalyzer() {
        return new WhitespaceAnalyzer();
    }

    /**
     * Indexes all input files of the current execution into a Lucene index.
     * The index is written to the execution's configured index directory if
     * set; otherwise a temporary directory (deleted on JVM exit) is created.
     * The chosen directory is stored as the execution's output directory.
     * Sets the execution status to FINISHED on success, or FAILED if any
     * input file cannot be retrieved from the data store.
     *
     * @throws IOException if the index directory or index cannot be written
     */
    @Override
    public void execute() throws IOException {
        // Use the configured index directory if given, otherwise create a
        // temporary directory that is removed on JVM exit.
        String indexPath;
        if (null != getExecution().getIndexDirectory() && !getExecution().getIndexDirectory().isEmpty()) {
            indexPath = getExecution().getIndexDirectory();
        } else {
            indexPath = Files.createTempDirectory(
                    InfolisConfig.getTmpFilePath().toAbsolutePath(), INDEX_DIR_PREFIX).toString();
            FileUtils.forceDeleteOnExit(new File(indexPath));
        }
        log.debug("Indexing to: " + indexPath);
        getExecution().setOutputDirectory(indexPath);

        IndexWriterConfig indexWriterConfig = new IndexWriterConfig(createAnalyzer());
        indexWriterConfig.setOpenMode(OpenMode.CREATE);

        // An FSDirectory implementation that uses java.nio's FileChannel's
        // positional read, which allows multiple threads to read from the same
        // file without synchronizing. NIOFSDirectory is not recommended on
        // Windows because of a bug in how FileChannel.read is implemented in
        // Sun's JRE: inside of the implementation the position is apparently
        // synchronized.
        // try-with-resources guarantees the directory is closed even when the
        // IndexWriter constructor or the indexing loop throws.
        try (Directory fsIndexDir = NIOFSDirectory.open(Paths.get(indexPath))) {
            // Resolve all input file URIs up front; abort with FAILED status
            // if any of them cannot be retrieved.
            List<InfolisFile> files = new ArrayList<>();
            for (String fileUri : getExecution().getInputFiles()) {
                try {
                    files.add(this.getInputDataStoreClient().get(InfolisFile.class, fileUri));
                } catch (BadRequestException | ProcessingException e) {
                    error(log, "Could not retrieve file " + fileUri + ": " + e.getMessage());
                    getExecution().setStatus(ExecutionStatus.FAILED);
                    return;
                }
            }
            long start = System.currentTimeMillis();
            log.debug("Starting to index");
            try (IndexWriter writer = new IndexWriter(fsIndexDir, indexWriterConfig)) {
                int counter = 0;
                for (InfolisFile file : files) {
                    counter++;
                    log.trace("Indexing file " + file);
                    writer.addDocument(toLuceneDocument(getInputFileResolver(), file));
                    updateProgress(counter, files.size());
                }
                // Merge segments only after every document was added
                // successfully; merging after a failure would be pointless and
                // an exception from forceMerge could mask the original error.
                log.debug("Merging all Lucene segments ...");
                writer.forceMerge(1);
            } catch (FileNotFoundException fnfe) {
                // NOTE: at least on windows, some temporary files raise this
                // exception with an "access denied" message; checking if the
                // file can be read doesn't help
                throw new RuntimeException("Could not write index entry: " + fnfe, fnfe);
            }
            getExecution().setStatus(ExecutionStatus.FINISHED);
            log.debug(String.format("Indexing %s documents took %s ms",
                    files.size(), System.currentTimeMillis() - start));
        }
    }

    @Override
    public void validate() throws IllegalAlgorithmArgumentException {
        Execution exec = this.getExecution();
        if (null == exec.getInputFiles() || exec.getInputFiles().isEmpty()) {
            throw new IllegalAlgorithmArgumentException(getClass(), "inputFiles", "missing or empty");
        }
    }

    /**
     * Builds a lucene document for a file. Documents are created as follows:
     * <ol>
     * <li>The URI of the file is added as a field named "path". The field is
     * indexed (i.e. searchable), but not tokenized into words.</li>
     * <li>The file name is added as a field named "fileName", likewise
     * indexed but not tokenized.</li>
     * <li>The contents of the file are added to a field named "contents".
     * The text is tokenized and indexed, and stored in the index along with
     * position and offset information. Note that the file is expected to be
     * in UTF-8 encoding. If that's not the case, searching for special
     * characters will fail.</li>
     * </ol>
     *
     * @param fileResolver resolver used to open an input stream for the file
     * @param f a txt-file to be included in the lucene index
     * @return a lucene document
     * @throws IOException if the file content cannot be read
     */
    public static Document toLuceneDocument(FileResolver fileResolver, InfolisFile f) throws IOException {
        // Read the whole file as UTF-8 text; try-with-resources closes the
        // underlying stream even if reading fails.
        StringBuilder contents = new StringBuilder();
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(fileResolver.openInputStream(f), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                contents.append(line).append(System.lineSeparator());
            }
        }
        String text = contents.toString();

        Document doc = new Document();
        doc.add(new StringField("path", f.getUri(), Field.Store.YES));
        doc.add(new StringField("fileName", f.getFileName(), Field.Store.YES));
        // TODO kba: Add modified to InfolisFile
        //doc.add(new LongField("modified", lastModified, Field.Store.NO));
        // file is expected to be in UTF-8 encoding.
        // If that's not the case searching for special characters will fail.
        // Store content with positions and offsets so highlighting and
        // phrase/proximity queries work on the "contents" field.
        FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
        offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
        doc.add(new Field("contents", text, offsetsType));
        return doc;
    }
}