package no.priv.garshol.duke.databases.es;
import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import no.priv.garshol.duke.Configuration;
import no.priv.garshol.duke.Database;
import no.priv.garshol.duke.DukeConfigException;
import no.priv.garshol.duke.DukeException;
import no.priv.garshol.duke.Property;
import no.priv.garshol.duke.Record;
import no.priv.garshol.duke.RecordImpl;
import no.priv.garshol.duke.utils.Utils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse;
import org.elasticsearch.action.admin.indices.create.CreateIndexResponse;
import org.elasticsearch.action.admin.indices.exists.indices.IndicesExistsRequest;
import org.elasticsearch.action.admin.indices.exists.indices.IndicesExistsResponse;
import org.elasticsearch.action.admin.indices.refresh.RefreshRequest;
import org.elasticsearch.action.bulk.BulkRequestBuilder;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.action.get.GetResponse;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.action.search.SearchType;
import org.elasticsearch.client.Client;
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.collect.Iterables;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.transport.InetSocketTransportAddress;
import org.elasticsearch.index.query.BoolQueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.index.query.QueryStringQueryBuilder;
import org.elasticsearch.node.Node;
import org.elasticsearch.node.NodeBuilder;
import org.elasticsearch.search.SearchHit;
public class ElasticSearchDatabase implements Database {
private static final int HOST_PORT_DEFAULT = 9300;
private static final String[] DATA_SUBDIRS = { "data", "work", "logs" };
private Configuration config;
private Property idProperty;
private boolean overwrite;
private Client client;
private Node node;
private String cluster;
private boolean clientOnly;
private boolean local;
private boolean clientSniff;
private StorageType storageType;
private String dataFolder;
private Collection<String> tAddresses;
private Analyzer analyzer;
private BulkRequestBuilder bulkRequest;
private int bulkRequestCounter;
private int bulkSize;
private String indexName;
private String indexType;
private int maxSearchHits;
public ElasticSearchDatabase() {
this.cluster = "duke-es";
// remote client defaults
this.clientSniff = true;
// local client/node defaults
this.storageType = StorageType.MEMORY;
this.clientOnly = false;
this.local = true;
// index and search stuff
this.indexName = "duke";
this.indexType = "record"; // TODO: figure out if we can do something
// smarter with this
this.maxSearchHits = 100;
this.bulkSize = 5000;
this.analyzer = new StandardAnalyzer();
}
private void init() {
this.setupConnection();
// Create index if it does not already exist
IndicesExistsResponse response = client.admin().indices()
.exists(new IndicesExistsRequest(indexName)).actionGet();
boolean forceCreate = false;
if (response.isExists() && !this.overwrite) {
client.admin().indices().prepareDelete(this.indexName).execute()
.actionGet();
forceCreate = true;
}
if (!response.isExists() || forceCreate) {
CreateIndexResponse create = client.admin().indices()
.prepareCreate(indexName).execute().actionGet();
try {
Thread.sleep(200);
} catch (InterruptedException e) {
throw new RuntimeException(
"Interrupted while waiting for index to settle in", e);
}
if (!create.isAcknowledged()) {
throw new IllegalArgumentException("Could not create index: "
+ indexName);
}
// create mapping
// XContentBuilder builder = null;
// try {
// builder =
// XContentFactory.jsonBuilder().startObject().startObject(this.indexType).startObject("properties");
// for (Property p : config.getProperties()) {
// if (!p.isIdProperty()) {
// // TODO: experiment similarity OKAPY BM25 for short
// // fields
// //
// (http://info.elasticsearch.com/rs/elasticsearch/images/What's%20new%20in%200.90%205-3-12.pdf)
// builder.startObject(p.getName()).field("type",
// "string").field("store", "yes").field("index", "analyzed")
// .endObject();
// }
// }
// builder.endObject().endObject().endObject();
// } catch (IOException e) {
// e.printStackTrace();
// }
// PutMappingResponse pmrb =
// client.admin().indices().preparePutMapping(this.indexName).setType(this.indexType).setSource(builder)
// .execute().actionGet();
}
// find id property
Collection<Property> identityProperties = this.config
.getIdentityProperties();
if (identityProperties == null || identityProperties.size() != 1) {
throw new java.lang.IllegalStateException(
"Unable to handle entities without single id");
}
this.idProperty = Iterables.get(identityProperties, 0);
// disable index refresh interval to improve indexing performance
// this is enabled back in commit()
ImmutableSettings.Builder indexSettings = ImmutableSettings
.settingsBuilder();
indexSettings.put("refresh_interval", -1);
this.client.admin().indices().prepareUpdateSettings(this.indexName)
.setSettings(indexSettings).execute().actionGet();
this.bulkRequest = this.client.prepareBulk();
}
private void setupConnection() {
ImmutableSettings.Builder settings = ImmutableSettings
.settingsBuilder();
settings.put("cluster.name", this.cluster);
if (this.tAddresses == null) {
NodeBuilder builder = NodeBuilder.nodeBuilder();
File dFolder = null;
if (this.dataFolder == null) {
dFolder = Utils.createTempDirectory("duke-es");
} else {
dFolder = new File(this.dataFolder);
if (!dFolder.exists()) {
dFolder.mkdirs();
}
}
System.out.println("ElasicSearch node folder " + dFolder);
for (String sub : DATA_SUBDIRS) {
String subdir = dFolder.getPath() + File.separator + sub;
File f = new File(subdir);
if (!f.exists()) {
f.mkdirs();
}
settings.put("path." + sub, subdir);
}
if (this.storageType == StorageType.MEMORY) {
settings.put("index.store.type", "memory");
}
builder.settings(settings.build());
this.node = builder.client(this.clientOnly).local(this.local)
.node();
this.client = this.node.client();
} else {
settings.put("client.transport.sniff", this.clientSniff);
this.client = new TransportClient(settings.build());
for (String address : this.tAddresses) {
String[] hostparts = address.split(":");
String hostname = hostparts[0];
int hostport = HOST_PORT_DEFAULT;
if (hostparts.length == 2) {
hostport = Integer.parseInt(hostparts[1]);
}
((TransportClient) client)
.addTransportAddress(new InetSocketTransportAddress(
hostname, hostport));
}
}
ClusterHealthResponse actionGet = this.client.admin().cluster()
.prepareHealth().setWaitForYellowStatus().execute().actionGet();
System.out.println("ElasticSearch Health Check " + actionGet);
}
/**
* Returns true if the ES index is held in memory rather than on disk.
*/
@Override
public boolean isInMemory() {
return this.storageType == StorageType.MEMORY;
}
/**
* Add the record to the index.
*/
@Override
public void index(Record record) {
if (this.client == null) {
this.init();
}
String id = null;
Map<String, Object> json = new HashMap<String, Object>();
for (String propname : record.getProperties()) {
Property prop = config.getPropertyByName(propname);
if (prop == null) {
throw new DukeConfigException("Record has property " + propname
+ " for which there is no configuration");
}
if (prop.isIdProperty()) {
id = record.getValue(propname);
} else {
Collection<String> values = record.getValues(propname);
if (values != null && !values.isEmpty()) {
if (values.size() == 1) {
json.put(propname, Iterables.get(values, 0));
} else {
json.put(propname, values);
}
}
}
}
this.addToIndex(id, json);
}
@Override
public void commit() {
if (this.client != null) {
this.flushIndex(true);
this.client.admin().indices()
.refresh(new RefreshRequest(this.indexName)).actionGet();
// enable index auto refresh
ImmutableSettings.Builder indexSettings = ImmutableSettings
.settingsBuilder();
indexSettings.put("refresh_interval", 1);
this.client.admin().indices().prepareUpdateSettings(this.indexName)
.setSettings(indexSettings).execute().actionGet();
this.client.admin().indices().prepareOptimize(this.indexName)
.setMaxNumSegments(5).execute().actionGet();
}
}
@Override
public Record findRecordById(String id) {
GetResponse getResponse = client
.prepareGet(this.indexName, this.indexType, id).execute()
.actionGet();
return this
.readFromSource(getResponse.getId(), getResponse.getSource());
}
@Override
public Collection<Record> findCandidateMatches(Record record) {
Collection<Record> records = new ArrayList<Record>();
BoolQueryBuilder bqb = QueryBuilders.boolQuery();
for (Property prop : config.getLookupProperties()) {
String propName = prop.getName();
boolean required = prop.getLookupBehaviour() == Property.Lookup.REQUIRED;
Collection<String> values = record.getValues(propName);
if (values == null) {
continue;
}
StringBuilder queryString = new StringBuilder();
for (String v : values) {
try {
TokenStream tokenStream = analyzer.tokenStream(propName,
new StringReader(v));
tokenStream.reset();
CharTermAttribute attr = tokenStream
.getAttribute(CharTermAttribute.class);
while (tokenStream.incrementToken()) {
queryString.append(attr.toString()).append(" ");
}
tokenStream.close();
} catch (IOException e) {
throw new DukeException("Error parsing input string '" + v
+ "' " + "in field " + propName);
}
}
Float boostFactor = this.getBoostFactor(prop.getHighProbability());
if (queryString.length() <= 0)
continue;
QueryStringQueryBuilder qsqb = QueryBuilders
.queryString(queryString.toString().trim())
.defaultField(propName).boost(boostFactor);
bqb = required ? bqb.must(qsqb) : bqb.should(qsqb);
}
SearchResponse response = this.client.prepareSearch(this.indexName)
.setTypes(this.indexType)
.setSearchType(SearchType.DFS_QUERY_THEN_FETCH).setQuery(bqb)
.setSize(this.maxSearchHits).execute().actionGet();
SearchHit[] results = response.getHits().getHits();
for (SearchHit hit : results) {
records.add(this.readFromSource(hit.getId(), hit.getSource()));
}
return records;
}
@Override
public void close() {
if (this.client != null) {
this.client.close();
this.client = null;
}
if (this.node != null && !this.node.isClosed()) {
this.node.close();
this.node = null;
}
}
@Override
public String toString() {
return "ElasticSearchDatabase [idProperty=" + idProperty
+ ", overwrite=" + overwrite + ", client=" + client + ", node="
+ node + ", cluster=" + cluster + ", clientOnly=" + clientOnly
+ ", local=" + local + ", clientSniff=" + clientSniff
+ ", storageType=" + storageType + ", dataFolder=" + dataFolder
+ ", tAddresses=" + tAddresses + ", bulkSize=" + bulkSize
+ ", indexName=" + indexName + ", indexType=" + indexType
+ ", maxSearchHits=" + maxSearchHits + "]";
}
private Record readFromSource(String id, Map<String, Object> source) {
RecordImpl record = null;
if (source != null) {
record = new RecordImpl();
// add the id first ...
record.addValue(this.idProperty.getName(), id);
// ... then the other fields
for (String key : source.keySet()) {
Object value = source.get(key);
if (value instanceof Collection<?>) {
for (Object v : (Collection<?>) value) {
record.addValue(key, v.toString());
}
} else {
record.addValue(key, value.toString());
}
}
}
return record;
}
private void addToIndex(String id, Map<String, Object> json) {
this.bulkRequest.add(this.client.prepareIndex(this.indexName,
this.indexType, id).setSource(json));
this.bulkRequestCounter++;
this.flushIndex(false);
}
private void flushIndex(boolean force) {
if ((force && this.bulkRequestCounter > 0)
|| this.bulkRequestCounter >= this.bulkSize) {
BulkResponse bulkResponse = this.bulkRequest.execute().actionGet();
if (bulkResponse.hasFailures()) {
throw new DukeException(bulkResponse.buildFailureMessage());
}
// reset bulk
this.bulkRequestCounter = 0;
this.bulkRequest = this.client.prepareBulk();
}
}
private Float getBoostFactor(double probability) {
return (float) Math.sqrt(1.0 / ((1.0 - probability) * 2.0));
}
public String getCluster() {
return cluster;
}
public void setCluster(String cluster) {
this.cluster = cluster;
}
public boolean isOverwrite() {
return overwrite;
}
@Override
public void setConfiguration(Configuration config) {
this.config = config;
}
@Override
public void setOverwrite(boolean overwrite) {
this.overwrite = overwrite;
}
public void setMaxSearchHits(int maxSearchHits) {
this.maxSearchHits = maxSearchHits;
}
public void setIndexName(String indexName) {
this.indexName = indexName;
}
}