/** * ElasticsearchClient * Copyright 18.02.2016 by Michael Peter Christen, @0rb1t3r * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program in the file lgpl21.txt * If not, see <http://www.gnu.org/licenses/>. */ package org.loklak.data; import java.io.File; import java.io.IOException; import java.net.InetAddress; import java.net.UnknownHostException; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.util.AbstractMap; import java.util.ArrayList; import java.util.Calendar; import java.util.Collection; import java.util.Date; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedHashMap; import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.TreeMap; import org.eclipse.jetty.util.log.Log; import org.elasticsearch.action.ActionWriteResponse; import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse; import org.elasticsearch.action.admin.cluster.stats.ClusterStatsAction; import org.elasticsearch.action.admin.cluster.stats.ClusterStatsNodes; import org.elasticsearch.action.admin.cluster.stats.ClusterStatsRequest; import org.elasticsearch.action.admin.cluster.stats.ClusterStatsRequestBuilder; import org.elasticsearch.action.admin.cluster.stats.ClusterStatsResponse; import org.elasticsearch.action.admin.cluster.tasks.PendingClusterTasksResponse; import org.elasticsearch.action.bulk.BulkItemResponse; import org.elasticsearch.action.bulk.BulkRequestBuilder; import org.elasticsearch.action.bulk.BulkResponse; import org.elasticsearch.action.delete.DeleteRequest; import org.elasticsearch.action.get.GetResponse; import org.elasticsearch.action.get.MultiGetItemResponse; import org.elasticsearch.action.get.MultiGetResponse; import org.elasticsearch.action.index.IndexResponse; import org.elasticsearch.action.search.SearchRequestBuilder; import org.elasticsearch.action.search.SearchResponse; import org.elasticsearch.action.search.SearchType; import org.elasticsearch.client.Client; import org.elasticsearch.client.transport.TransportClient; import org.elasticsearch.cluster.health.ClusterHealthStatus; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.transport.InetSocketTransportAddress; import org.elasticsearch.common.unit.Fuzziness; import org.elasticsearch.common.unit.TimeValue; import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.index.VersionType; import org.elasticsearch.index.query.MatchQueryBuilder.Operator; import org.elasticsearch.index.query.MatchQueryBuilder.ZeroTermsQuery; import org.elasticsearch.index.query.BoolQueryBuilder; import org.elasticsearch.index.query.QueryBuilder; import org.elasticsearch.index.query.QueryBuilders; import org.elasticsearch.index.query.RangeQueryBuilder; import org.elasticsearch.node.Node; import org.elasticsearch.node.NodeBuilder; import org.elasticsearch.search.SearchHit; import org.elasticsearch.search.SearchHits; import org.elasticsearch.search.aggregations.AggregationBuilders; import org.elasticsearch.search.aggregations.bucket.histogram.DateHistogramInterval; import org.elasticsearch.search.aggregations.bucket.histogram.InternalHistogram; import org.elasticsearch.search.aggregations.bucket.terms.Terms; import org.elasticsearch.search.aggregations.bucket.terms.Terms.Bucket; import org.elasticsearch.search.sort.SortBuilders; import org.elasticsearch.search.sort.SortOrder; import org.joda.time.format.DateTimeFormatter; import org.joda.time.format.ISODateTimeFormat; import org.loklak.objects.ResultList; import org.loklak.tools.DateParser; public class ElasticsearchClient { private static long throttling_time_threshold = 2000L; // update time high limit private static long throttling_ops_threshold = 1000L; // messages per second low limit private static double throttling_factor = 1.0d; // factor applied on update duration if both thresholds are passed public final static BulkWriteResult EMPTY_BULK_RESULT = new BulkWriteResult(); private Node elasticsearchNode; private Client elasticsearchClient; /** * create a elasticsearch transport client (remote elasticsearch) * @param addresses an array of host:port addresses * @param clusterName */ public ElasticsearchClient(final String[] addresses, final String clusterName) { // create default settings and add cluster name Settings.Builder settings = Settings.builder() .put("cluster.name", clusterName) .put("cluster.routing.allocation.enable", "all") .put("cluster.routing.allocation.allow_rebalance", "true"); // create a client TransportClient tc = TransportClient.builder() .settings(settings.build()) .build(); for (String address: addresses) { String a = address.trim(); int p = a.indexOf(':'); if (p >= 0) try { InetAddress i = InetAddress.getByName(a.substring(0, p)); int port = Integer.parseInt(a.substring(p + 1)); tc.addTransportAddress(new InetSocketTransportAddress(i, port)); } catch (UnknownHostException e) { Log.getLog().warn(e); } } this.elasticsearchClient = tc; } /** * create a elasticsearch node client (embedded elasticsearch) * @param settings */ public ElasticsearchClient(final Settings.Builder settings) { // create a node this.elasticsearchNode = NodeBuilder.nodeBuilder().local(false).settings(settings).node(); // create a client this.elasticsearchClient = elasticsearchNode.client(); } public ClusterStatsNodes getClusterStatsNodes() { ClusterStatsRequest clusterStatsRequest = new ClusterStatsRequestBuilder(elasticsearchClient.admin().cluster(), ClusterStatsAction.INSTANCE).request(); ClusterStatsResponse clusterStatsResponse = elasticsearchClient.admin().cluster().clusterStats(clusterStatsRequest).actionGet(); ClusterStatsNodes clusterStatsNodes = clusterStatsResponse.getNodesStats(); return clusterStatsNodes; } private boolean clusterReadyCache = false; public boolean clusterReady() { if (clusterReadyCache) return true; ClusterHealthResponse chr = elasticsearchClient.admin().cluster().prepareHealth().get(); clusterReadyCache = chr.getStatus() != ClusterHealthStatus.RED; return clusterReadyCache; } public boolean wait_ready(long maxtimemillis, ClusterHealthStatus status) { // wait for yellow status long start = System.currentTimeMillis(); boolean is_ready; do { // wait for yellow status ClusterHealthResponse health = elasticsearchClient.admin().cluster().prepareHealth().setWaitForStatus(status).execute().actionGet(); is_ready = !health.isTimedOut(); if (!is_ready && System.currentTimeMillis() - start > maxtimemillis) return false; } while (!is_ready); return is_ready; } public void createIndexIfNotExists(String indexName, final int shards, final int replicas) { // create an index if not existent if (!this.elasticsearchClient.admin().indices().prepareExists(indexName).execute().actionGet().isExists()) { Settings.Builder settings = Settings.builder() .put("number_of_shards", shards) .put("number_of_replicas", replicas); this.elasticsearchClient.admin().indices().prepareCreate(indexName) .setSettings(settings) .setUpdateAllTypes(true) .execute().actionGet(); } else { //LOGGER.debug("Index with name {} already exists", indexName); } } public void setMapping(String indexName, XContentBuilder mapping) { try { this.elasticsearchClient.admin().indices().preparePutMapping(indexName) .setSource(mapping) .setUpdateAllTypes(true) .setType("_default_").execute().actionGet(); } catch (Throwable e) { Log.getLog().warn(e); }; } public void setMapping(String indexName, Map<String, Object> mapping) { try { this.elasticsearchClient.admin().indices().preparePutMapping(indexName) .setSource(mapping) .setUpdateAllTypes(true) .setType("_default_").execute().actionGet(); } catch (Throwable e) { Log.getLog().warn(e); }; } public void setMapping(String indexName, String mapping) { try { this.elasticsearchClient.admin().indices().preparePutMapping(indexName) .setSource(mapping) .setUpdateAllTypes(true) .setType("_default_").execute().actionGet(); } catch (Throwable e) { Log.getLog().warn(e); }; } public void setMapping(String indexName, File json) { try { this.elasticsearchClient.admin().indices().preparePutMapping(indexName) .setSource(new String(Files.readAllBytes(json.toPath()), StandardCharsets.UTF_8)) .setUpdateAllTypes(true) .setType("_default_") .execute() .actionGet(); } catch (Throwable e) { Log.getLog().warn(e); }; } public String pendingClusterTasks() { PendingClusterTasksResponse r = this.elasticsearchClient.admin().cluster().preparePendingClusterTasks().get(); return r.prettyPrint(); } public String clusterStats() { ClusterStatsResponse r = this.elasticsearchClient.admin().cluster().prepareClusterStats().get(); return r.toString(); } public Map<String, String> nodeSettings() { return this.elasticsearchClient.settings().getAsMap(); } /** * Close the connection to the remote elasticsearch client. This should only be called when the application is * terminated. * Please avoid to open and close the ElasticsearchClient for the same cluster and index more than once. * To avoid that this method is called more than once, the elasticsearch_client object is set to null * as soon this was called the first time. This is needed because the finalize method calls this * method as well. */ public void close() { if (this.elasticsearchClient != null) { this.elasticsearchClient.close(); this.elasticsearchClient = null; } if (this.elasticsearchNode != null) { this.elasticsearchNode.close(); this.elasticsearchNode = null; } } /** * A finalize method is added to ensure that close() is always called. */ public void finalize() { this.close(); // will not cause harm if this is the second call to close() } /** * Retrieve a statistic object from the connected elasticsearch cluster * * @return cluster stats from connected cluster */ public ClusterStatsNodes getStats() { final ClusterStatsRequest clusterStatsRequest = new ClusterStatsRequestBuilder(elasticsearchClient.admin().cluster(), ClusterStatsAction.INSTANCE) .request(); final ClusterStatsResponse clusterStatsResponse = elasticsearchClient.admin().cluster().clusterStats(clusterStatsRequest).actionGet(); final ClusterStatsNodes clusterStatsNodes = clusterStatsResponse.getNodesStats(); return clusterStatsNodes; } /** * Get the number of documents in the search index * * @return the count of all documents in the index */ public long count(String indexName) { return count(QueryBuilders.constantScoreQuery(QueryBuilders.matchAllQuery()), indexName); } /** * Get the number of documents in the search index for a given search query * * @param q * the query * @return the count of all documents in the index which matches with the query */ public long count(final QueryBuilder q, final String indexName) { SearchResponse response = elasticsearchClient.prepareSearch(indexName).setQuery(q).setSize(0).execute().actionGet(); return response.getHits().getTotalHits(); } public long count(final String index, final String histogram_timefield, final long millis) { try { SearchResponse response = elasticsearchClient.prepareSearch(index) .setSize(0) .setQuery(millis <= 0 ? QueryBuilders.constantScoreQuery(QueryBuilders.matchAllQuery()) : QueryBuilders.rangeQuery(histogram_timefield).from(new Date(System.currentTimeMillis() - millis))) .execute() .actionGet(); return response.getHits().getTotalHits(); } catch (Throwable e) { Log.getLog().warn(e); return 0; } } public long countLocal(final String index, final String provider_hash) { try { SearchResponse response = elasticsearchClient.prepareSearch(index) .setSize(0) .setQuery(QueryBuilders.matchQuery("provider_hash", provider_hash)) .execute() .actionGet(); return response.getHits().getTotalHits(); } catch (Throwable e) { Log.getLog().warn(e); return 0; } } /** * Get the document for a given id. If you don't know the typeName of the index, then it is not recommended * to use this method. You can set typeName to null and get the correct answer, but you still need the information * in which type the document was found if you want to call this API with the type afterwards. In such a case, * use the method getType() which returns null if the document does not exist and the type name if the document exist. * DO NOT USE THIS METHOD if you call getType anyway. I.e. replace a code like * if (exist(id()) { * String type = getType(id); * ... * } * with * String type = getType(id); * if (type != null) { * ... * } * * @param indexName * the name of the index * @param typeName * the type name, can be set to NULL for all types (see also: getType()) * @param id * the unique identifier of a document * @return the document, if it exists or null otherwise; */ public boolean exist(String indexName, String typeName, final String id) { GetResponse getResponse = elasticsearchClient .prepareGet(indexName, typeName, id) .setOperationThreaded(false) .setFields(new String[]{}) .execute() .actionGet(); return getResponse.isExists(); } public Set<String> existBulk(String indexName, String typeName, final Collection<String> ids) { if (ids == null || ids.size() == 0) return new HashSet<>(); MultiGetResponse multiGetItemResponses = elasticsearchClient.prepareMultiGet() .add(indexName, typeName, ids) .get(); Set<String> er = new HashSet<>(); for (MultiGetItemResponse itemResponse : multiGetItemResponses) { GetResponse response = itemResponse.getResponse(); if (response.isExists()) { er.add(response.getId()); } } return er; } /** * Get the type name of a document or null if the document does not exist. * This is a replacement of the exist() method which does exactly the same as exist() * but is able to return the type name in case that exist is successful. * Please read the comment to exist() for details. * @param indexName * the name of the index * @param id * the unique identifier of a document * @return the type name of the document if it exists, null otherwise */ public String getType(String indexName, final String id) { GetResponse getResponse = elasticsearchClient.prepareGet(indexName, null, id).execute().actionGet(); return getResponse.isExists() ? getResponse.getType() : null; } /** * Delete a document for a given id. * ATTENTION: deleted documents cannot be re-inserted again if version number * checking is used and the new document does not comply to the version number * rule. The information which document was deleted persists for one minute and * then inserting documents with the same version number as before is possible. * To modify this behavior, change the configuration setting index.gc_deletes * * @param id * the unique identifier of a document * @return true if the document existed and was deleted, false otherwise */ public boolean delete(String indexName, String typeName, final String id) { return elasticsearchClient.prepareDelete(indexName, typeName, id).execute().actionGet().isFound(); } /** * Delete a list of documents for a given set of ids * ATTENTION: read about the time-out of version number checking in the method above. * * @param ids * a map from the unique identifier of a document to the document type * @return the number of deleted documents */ public int deleteBulk(String indexName, Map<String, String> ids) { // bulk-delete the ids if (ids == null || ids.size() == 0) return 0; BulkRequestBuilder bulkRequest = elasticsearchClient.prepareBulk(); for (Map.Entry<String, String> id : ids.entrySet()) { bulkRequest.add(new DeleteRequest().id(id.getKey()).index(indexName).type(id.getValue())); } bulkRequest.execute().actionGet(); return ids.size(); } /** * Delete documents using a query. Check what would be deleted first with a normal search query! * Elasticsearch once provided a native prepareDeleteByQuery method, but this was removed * in later versions. Instead, there is a plugin which iterates over search results, * see https://www.elastic.co/guide/en/elasticsearch/plugins/current/plugins-delete-by-query.html * We simulate the same behaviour here without the need of that plugin. * * @param q * @return delete document count */ public int deleteByQuery(String indexName, final QueryBuilder q) { Map<String, String> ids = new TreeMap<>(); // FIXME: deprecated, "will be removed in 3.0, you should do a regular scroll instead, ordered by `_doc`" @SuppressWarnings("deprecation") SearchResponse response = elasticsearchClient.prepareSearch(indexName).setSearchType(SearchType.SCAN) .setScroll(new TimeValue(60000)).setQuery(q).setSize(100).execute().actionGet(); while (true) { // accumulate the ids here, don't delete them right now to prevent an interference of the delete with the // scroll for (SearchHit hit : response.getHits().getHits()) { ids.put(hit.getId(), hit.getType()); } response = elasticsearchClient.prepareSearchScroll(response.getScrollId()).setScroll(new TimeValue(600000)) .execute().actionGet(); // termination if (response.getHits().getHits().length == 0) break; } return deleteBulk(indexName, ids); } /** * Read a document from the search index for a given id. * This is the cheapest document retrieval from the '_source' field because * elasticsearch does not do any json transformation or parsing. We * get simply the text from the '_source' field. This might be useful to * make a dump from the index content. * * @param id * the unique identifier of a document * @return the document as source text */ public byte[] readSource(String indexName, final String id) { GetResponse response = elasticsearchClient.prepareGet(indexName, null, id).execute().actionGet(); return response.getSourceAsBytes(); } /** * Read a json document from the search index for a given id. * Elasticsearch reads the '_source' field and parses the content as json. * * @param id * the unique identifier of a document * @return the document as json, matched on a Map<String, Object> object instance */ public Map<String, Object> readMap(String indexName, final String id) { GetResponse response = elasticsearchClient.prepareGet(indexName, null, id).execute().actionGet(); Map<String, Object> map = getMap(response); return map; } protected static Map<String, Object> getMap(GetResponse response) { Map<String, Object> map = null; if (response.isExists() && (map = response.getSourceAsMap()) != null) { map.put("$type", response.getType()); } return map; } /** * Write a json document into the search index. * Writing using a XContentBuilder is the most efficient way to add content to elasticsearch * * @param jsonMap * the json document to be indexed in elasticsearch * @param id * the unique identifier of a document * @param indexName * the name of the index * @param typeName * the type of the index */ public IndexResponse writeSource(String indexName, XContentBuilder json, String id, String typeName, long version, VersionType versionType) { // put this to the index IndexResponse r = elasticsearchClient.prepareIndex(indexName, typeName, id).setSource(json) .setVersion(version).setVersionType(versionType).execute() .actionGet(); // documentation about the versioning is available at // https://www.elastic.co/blog/elasticsearch-versioning-support // TODO: error handling return r; } /** * Write a json document into the search index. The id must be calculated by the calling environment. * This id should be unique for the json. The best way to calculate this id is, to use an existing * field from the jsonMap which contains a unique identifier for the jsonMap. * * @param indexName * the name of the index * @param jsonMap * the json document to be indexed in elasticsearch * @param typeName * the type of the index * @param id * the unique identifier of a document * @return true if the document with given id did not exist before, false if it existed and was overwritten */ public boolean writeMap(String indexName, final Map<String, Object> jsonMap, String typeName, String id) { long start = System.currentTimeMillis(); // get the version number out of the json, if any is given Long version = (Long) jsonMap.remove("_version"); // put this to the index IndexResponse r = elasticsearchClient.prepareIndex(indexName, typeName, id).setSource(jsonMap) .setVersion(version == null ? 1 : version.longValue()) .setVersionType(version == null ? VersionType.FORCE : VersionType.EXTERNAL) .execute() .actionGet(); if (version != null) jsonMap.put("_version", version); // to prevent side effects // documentation about the versioning is available at // https://www.elastic.co/blog/elasticsearch-versioning-support // TODO: error handling boolean created = r.isCreated(); // true means created, false means updated long duration = Math.max(1, System.currentTimeMillis() - start); long regulator = 0; if (duration > throttling_time_threshold) { regulator = (long) (throttling_factor * duration); try {Thread.sleep(regulator);} catch (InterruptedException e) {} } Log.getLog().info("elastic write entry to index " + indexName + ": " + (created ? "created":"updated") + ", " + duration + " ms" + (regulator == 0 ? "" : ", throttled with " + regulator + " ms")); return created; } /** * bulk message write * @param jsonMapList * a list of json documents to be indexed * @param indexName * the name of the index * @param typeName * the type of the index * @return a list with error messages. * The key is the id of the document, the value is an error string. * The method was only successful if this list is empty. * This must be a list, because keys may appear several times. */ public BulkWriteResult writeMapBulk(final String indexName, final List<BulkEntry> jsonMapList) { long start = System.currentTimeMillis(); BulkRequestBuilder bulkRequest = elasticsearchClient.prepareBulk(); for (BulkEntry be: jsonMapList) { if (be.id == null) continue; bulkRequest.add( elasticsearchClient.prepareIndex(indexName, be.type, be.id).setSource(be.jsonMap) .setVersion(be.version == null ? 1 : be.version.longValue()) .setVersionType(be.version == null ? VersionType.FORCE : VersionType.EXTERNAL)); } BulkResponse bulkResponse = bulkRequest.get(); BulkWriteResult result = new BulkWriteResult(); for (BulkItemResponse r: bulkResponse.getItems()) { String id = r.getId(); ActionWriteResponse response = r.getResponse(); if (response instanceof IndexResponse) { if (((IndexResponse) response).isCreated()) result.created.add(id); } String err = r.getFailureMessage(); if (err != null) { result.errors.put(id, err); } } long duration = Math.max(1, System.currentTimeMillis() - start); long regulator = 0; long ops = result.created.size() * 1000 / duration; if (duration > throttling_time_threshold && ops < throttling_ops_threshold) { regulator = (long) (throttling_factor * duration); try {Thread.sleep(regulator);} catch (InterruptedException e) {} } Log.getLog().info("elastic write bulk to index " + indexName + ": " + jsonMapList.size() + " entries, " + result.created.size() + " created, " + result.errors.size() + " errors, " + duration + " ms" + (regulator == 0 ? "" : ", throttled with " + regulator + " ms") + ", " + ops + " objects/second"); return result; } public static class BulkWriteResult { private Map<String, String> errors; private Set<String> created; public BulkWriteResult() { this.errors = new LinkedHashMap<>(); this.created = new LinkedHashSet<>(); } public Map<String, String> getErrors() { return this.errors; } public Set<String> getCreated() { return this.created; } } private final static DateTimeFormatter utcFormatter = ISODateTimeFormat.dateTime().withZoneUTC(); public static class BulkEntry { private String id; private String type; private Long version; public Map<String, Object> jsonMap; /** * initialize entry for bulk writes * @param id the id of the entry * @param type the type name * @param timestamp_fieldname the name of the timestamp field, null for unused. If a name is given here, then this field is filled with the current time * @param version the version number >= 0 for external versioning or null for forced updates without versioning * @param jsonMap the payload object */ public BulkEntry(final String id, final String type, final String timestamp_fieldname, final Long version, final Map<String, Object> jsonMap) { this.id = id; this.type = type; this.version = version; this.jsonMap = jsonMap; if (timestamp_fieldname != null && !this.jsonMap.containsKey(timestamp_fieldname)) this.jsonMap.put(timestamp_fieldname, utcFormatter.print(System.currentTimeMillis())); } } /** * Query with a string and boundaries. * The string is supposed to be something that the user types in without a technical syntax. * The mapping of the search terms into the index can be different according * to a search type. Please see * https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-match-query.html. * A better way to do this would be the usage of a cursor. * See the delete method to find out how cursors work. * * @param q * a search query string * @param operator * either AND or OR, the default operator for the query tokens * @param offset * the first document number, 0 is the first one * @param count * the number of documents to be returned * @return a list of json objects, mapped as Map<String,Object> for each json */ public List<Map<String, Object>> query(final String indexName, final String q, final Operator operator, final int offset, final int count) { assert count > 1; // for smaller amounts, use the next method SearchRequestBuilder request = elasticsearchClient.prepareSearch(indexName) // .addFields("_all") .setQuery(QueryBuilders.multiMatchQuery(q, "_all").operator(operator).zeroTermsQuery(ZeroTermsQuery.ALL)).setFrom(offset).setSize(count); SearchResponse response = request.execute().actionGet(); SearchHit[] hits = response.getHits().getHits(); ArrayList<Map<String, Object>> result = new ArrayList<Map<String, Object>>(); for (SearchHit hit : hits) { Map<String, Object> map = hit.getSource(); result.add(map); } return result; } public Map<String, Object> query(final String indexName, final String field_name, String field_value) { if (field_value == null || field_value.length() == 0) return null; // prepare request BoolQueryBuilder query = QueryBuilders.boolQuery(); query.filter(QueryBuilders.constantScoreQuery(QueryBuilders.termQuery(field_name, field_value))); SearchRequestBuilder request = elasticsearchClient.prepareSearch(indexName) .setSearchType(SearchType.QUERY_THEN_FETCH) .setQuery(query) .setFrom(0) .setSize(1).setTerminateAfter(1); // get response SearchResponse response = request.execute().actionGet(); // evaluate search result //long totalHitCount = response.getHits().getTotalHits(); SearchHit[] hits = response.getHits().getHits(); if (hits.length == 0) return null; assert hits.length == 1; Map<String, Object> map = hits[0].getSource(); return map; } public Query query(final String indexName, final QueryBuilder queryBuilder, String order_field, int timezoneOffset, int resultCount, long histogram_interval, String histogram_timefield, int aggregationLimit, String... aggregationFields) { return new Query(indexName, queryBuilder, order_field, timezoneOffset, resultCount, histogram_interval, histogram_timefield, aggregationLimit, aggregationFields); } public class Query { public List<Map<String, Object>> result; public int hitCount; public Map<String, List<Map.Entry<String, Long>>> aggregations; /** * Search the local message cache using a elasticsearch query. * @param q - the query, for aggregation this which should include a time frame in the form since:yyyy-MM-dd until:yyyy-MM-dd * @param order_field - the field to order the results, i.e. Timeline.Order.CREATED_AT * @param timezoneOffset - an offset in minutes that is applied on dates given in the query of the form since:date until:date * @param resultCount - the number of messages in the result; can be zero if only aggregations are wanted * @param dateHistogrammInterval - the date aggregation interval or null, if no aggregation wanted * @param aggregationLimit - the maximum count of facet entities, not search results * @param aggregationFields - names of the aggregation fields. If no aggregation is wanted, pass no (zero) field(s) */ public Query(final String indexName, final QueryBuilder queryBuilder, String order_field, int timezoneOffset, int resultCount, long histogram_interval, String histogram_timefield, int aggregationLimit, String... aggregationFields) { // prepare request SearchRequestBuilder request = elasticsearchClient.prepareSearch(indexName) .setSearchType(SearchType.QUERY_THEN_FETCH) .setQuery(queryBuilder) .setFrom(0) .setSize(resultCount); request.clearRescorers(); if (resultCount > 0) { request.addSort( SortBuilders.fieldSort(order_field) .unmappedType(order_field) .order(SortOrder.DESC) ); } boolean addTimeHistogram = false; DateHistogramInterval dateHistogrammInterval = histogram_interval > DateParser.WEEK_MILLIS ? DateHistogramInterval.DAY : histogram_interval > DateParser.HOUR_MILLIS * 3 ? DateHistogramInterval.HOUR : DateHistogramInterval.MINUTE; for (String field: aggregationFields) { if (field.equals(histogram_timefield)) { addTimeHistogram = true; request.addAggregation(AggregationBuilders.dateHistogram(histogram_timefield).field(histogram_timefield).timeZone("UTC").minDocCount(0).interval(dateHistogrammInterval)); } else { request.addAggregation(AggregationBuilders.terms(field).field(field).minDocCount(1).size(aggregationLimit)); } } // get response SearchResponse response = request.execute().actionGet(); hitCount = (int) response.getHits().getTotalHits(); // evaluate search result //long totalHitCount = response.getHits().getTotalHits(); SearchHit[] hits = response.getHits().getHits(); this.result = new ArrayList<Map<String, Object>>(hitCount); for (SearchHit hit: hits) { Map<String, Object> map = hit.getSource(); this.result.add(map); } // evaluate aggregation // collect results: fields this.aggregations = new HashMap<>(); for (String field: aggregationFields) { if (field.equals(histogram_timefield)) continue; // this has special handling below Terms fieldCounts = response.getAggregations().get(field); List<Bucket> buckets = fieldCounts.getBuckets(); // aggregate double-tokens (matching lowercase) Map<String, Long> checkMap = new HashMap<>(); for (Bucket bucket: buckets) { String key = bucket.getKeyAsString().trim(); if (key.length() > 0) { String k = key.toLowerCase(); Long v = checkMap.get(k); checkMap.put(k, v == null ? bucket.getDocCount() : v + bucket.getDocCount()); } } ArrayList<Map.Entry<String, Long>> list = new ArrayList<>(buckets.size()); for (Bucket bucket: buckets) { String key = bucket.getKeyAsString().trim(); if (key.length() > 0) { Long v = checkMap.remove(key.toLowerCase()); if (v == null) continue; list.add(new AbstractMap.SimpleEntry<String, Long>(key, v)); } } aggregations.put(field, list); //if (field.equals("place_country")) { // special handling of country aggregation: add the country center as well //} } // date histogram: if (addTimeHistogram) { InternalHistogram<InternalHistogram.Bucket> dateCounts = response.getAggregations().get(histogram_timefield); ArrayList<Map.Entry<String, Long>> list = new ArrayList<>(); for (InternalHistogram.Bucket bucket : dateCounts.getBuckets()) { Calendar cal = Calendar.getInstance(DateParser.UTCtimeZone); org.joda.time.DateTime k = (org.joda.time.DateTime) bucket.getKey(); cal.setTime(k.toDate()); cal.add(Calendar.MINUTE, -timezoneOffset); long docCount = bucket.getDocCount(); Map.Entry<String,Long> entry = new AbstractMap.SimpleEntry<String, Long>( (dateHistogrammInterval == DateHistogramInterval.DAY ? DateParser.dayDateFormat : DateParser.minuteDateFormat) .format(cal.getTime()), docCount); list.add(entry); } aggregations.put(histogram_timefield, list); } } } /** * Search the local message cache using a elasticsearch query. * @param q - the query, can be empty for a matchall-query * @param resultCount - the number of messages in the result * @param sort_field - the field name to sort the result list, i.e. "query_first" * @param sort_order - the sort order (you want to use SortOrder.DESC here) */ public ResultList<Map<String, Object>> fuzzyquery(final String indexName, final String fieldName, final String q, final int resultCount, final String sort_field, final String default_sort_type, final SortOrder sort_order, final Date since, final Date until, final String range_field) { // prepare request BoolQueryBuilder suggest = QueryBuilders.boolQuery(); if (q != null && q.length() > 0) { suggest.should(QueryBuilders.fuzzyQuery(fieldName, q).fuzziness(Fuzziness.fromEdits(2))); suggest.should(QueryBuilders.moreLikeThisQuery(fieldName).like(q)); suggest.should(QueryBuilders.matchPhrasePrefixQuery(fieldName, q)); if (q.indexOf('*') >= 0 || q.indexOf('?') >= 0) suggest.should(QueryBuilders.wildcardQuery(fieldName, q)); suggest.minimumNumberShouldMatch(1); } BoolQueryBuilder query; if (range_field != null && range_field.length() > 0 && (since != null || until != null)) { query = QueryBuilders.boolQuery(); if (q.length() > 0) query.filter(suggest); RangeQueryBuilder rangeQuery = QueryBuilders.rangeQuery(range_field); if (since != null) rangeQuery.from(since).includeLower(true); if (until != null) rangeQuery.to(until).includeUpper(true); query.filter(rangeQuery); } else { query = suggest; } SearchRequestBuilder request = this.elasticsearchClient.prepareSearch(indexName) .setSearchType(SearchType.QUERY_THEN_FETCH) .setQuery(query) .setFrom(0) .setSize(resultCount) .addSort( SortBuilders.fieldSort(sort_field) .unmappedType(default_sort_type) .order(sort_order)); // get response SearchResponse response = request.execute().actionGet(); // evaluate search result //long totalHitCount = response.getHits().getTotalHits(); SearchHits rhits = response.getHits(); //long totalHits = rhits.getTotalHits(); ResultList<Map<String, Object>> result = new ResultList<Map<String, Object>>(); SearchHit[] hits = rhits.getHits(); for (SearchHit hit: hits) { Map<String, Object> map = hit.getSource(); result.add(map); } result.setHits(rhits.getTotalHits()); return result; } public List<Map<String, Object>> queryWithConstraints(final String indexName, final String fieldName, final String fieldValue, final Map<String, String> constraints, boolean latest) throws IOException { SearchRequestBuilder request = this.elasticsearchClient.prepareSearch(indexName) .setSearchType(SearchType.QUERY_THEN_FETCH) .setFrom(0); BoolQueryBuilder bFilter = QueryBuilders.boolQuery(); bFilter.filter(QueryBuilders.constantScoreQuery(QueryBuilders.constantScoreQuery(QueryBuilders.termQuery(fieldName, fieldValue)))); for (Object o : constraints.entrySet()) { @SuppressWarnings("rawtypes") Map.Entry entry = (Map.Entry) o; bFilter.filter(QueryBuilders.constantScoreQuery(QueryBuilders.termQuery((String) entry.getKey(), ((String) entry.getValue()).toLowerCase()))); } request.setQuery(bFilter); // get response SearchResponse response = request.execute().actionGet(); // evaluate search result ArrayList<Map<String, Object>> result = new ArrayList<Map<String, Object>>(); SearchHit[] hits = response.getHits().getHits(); for (SearchHit hit: hits) { Map<String, Object> map = hit.getSource(); result.add(map); } return result; } public LinkedHashMap<String, Long> fullDateHistogram(final String indexName, int timezoneOffset, String histogram_timefield) { // prepare request SearchRequestBuilder request = elasticsearchClient.prepareSearch(indexName) .setSearchType(SearchType.QUERY_THEN_FETCH) .setQuery(QueryBuilders.constantScoreQuery(QueryBuilders.matchAllQuery())) .setFrom(0) .setSize(0); request.clearRescorers(); request.addAggregation(AggregationBuilders.dateHistogram(histogram_timefield).field(histogram_timefield).timeZone("UTC").minDocCount(1).interval(DateHistogramInterval.DAY)); // get response SearchResponse response = request.execute().actionGet(); // evaluate date histogram: InternalHistogram<InternalHistogram.Bucket> dateCounts = response.getAggregations().get(histogram_timefield); LinkedHashMap<String, Long> list = new LinkedHashMap<>(); for (InternalHistogram.Bucket bucket : dateCounts.getBuckets()) { Calendar cal = Calendar.getInstance(DateParser.UTCtimeZone); org.joda.time.DateTime k = (org.joda.time.DateTime) bucket.getKey(); cal.setTime(k.toDate()); cal.add(Calendar.MINUTE, -timezoneOffset); long docCount = bucket.getDocCount(); list.put(DateParser.dayDateFormat.format(cal.getTime()), docCount); } return list; } }