/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package com.meltwater.elasticsearch.index;

import com.meltwater.elasticsearch.action.BatchPercolateResponseItem;
import com.meltwater.elasticsearch.action.BatchPercolateShardRequest;
import com.meltwater.elasticsearch.action.BatchPercolateShardResponse;
import com.meltwater.elasticsearch.shard.BatchPercolatorQueriesRegistry;
import com.meltwater.elasticsearch.shard.QueryAndSource;
import com.meltwater.elasticsearch.shard.QueryMatch;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Counter;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.ElasticsearchIllegalArgumentException;
import org.elasticsearch.ElasticsearchParseException;
import org.elasticsearch.action.search.SearchType;
import org.elasticsearch.cache.recycler.CacheRecycler;
import org.elasticsearch.cache.recycler.PageCacheRecycler;
import org.elasticsearch.cluster.ClusterService;
import org.elasticsearch.cluster.action.index.MappingUpdatedAction;
import org.elasticsearch.common.base.Optional;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.collect.ImmutableMap;
import org.elasticsearch.common.collect.Maps;
import org.elasticsearch.common.collect.Tuple;
import org.elasticsearch.common.component.AbstractComponent;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.io.stream.BytesStreamOutput;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.util.BigArrays;
import org.elasticsearch.common.xcontent.*;
import org.elasticsearch.index.IndexService;
import org.elasticsearch.index.engine.Engine;
import org.elasticsearch.index.mapper.DocumentMapper;
import org.elasticsearch.index.mapper.MapperService;
import org.elasticsearch.index.mapper.ParsedDocument;
import org.elasticsearch.index.query.ParsedQuery;
import org.elasticsearch.index.shard.IndexShard;
import org.elasticsearch.index.shard.ShardId;
import org.elasticsearch.indices.IndicesService;
import org.elasticsearch.script.ScriptService;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.SearchParseElement;
import org.elasticsearch.search.SearchParseException;
import org.elasticsearch.search.SearchShardTarget;
import org.elasticsearch.search.fetch.FetchPhase;
import org.elasticsearch.search.highlight.HighlightPhase;
import org.elasticsearch.search.internal.DefaultSearchContext;
import org.elasticsearch.search.internal.SearchContext;
import org.elasticsearch.search.internal.ShardSearchLocalRequest;
import org.elasticsearch.search.query.QueryPhase;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentMap;
/**
 * For each request, the {@link BatchPercolatorService#percolate} method is
 * called on every shard that contains percolator queries.
 *
 * The percolation is then executed in the following way:
 * 1. Create a RAMDirectory and index all documents in the request into it.
 * 2. For each query:
 *    2.1 Parse the highlighting options and set them on the search context
 *        (a class which is extremely stateful).
 *    2.2 Execute the query and collect the results.
 * 3. Send back the matches for each document in a {@link BatchPercolateShardResponse}.
 */
public class BatchPercolatorService extends AbstractComponent {

    public final static String TYPE_NAME = ".batchpercolator";

    private final IndicesService indicesService;
    private final CacheRecycler cacheRecycler;
    private final PageCacheRecycler pageCacheRecycler;
    private final BigArrays bigArrays;
    private final ClusterService clusterService;
    private final HighlightPhase highlightPhase;
    private final ScriptService scriptService;
    private final MappingUpdatedAction mappingUpdatedAction;
    private final QueryPhase queryPhase;
    private final FetchPhase fetchPhase;

    @Inject
    public BatchPercolatorService(Settings settings, IndicesService indicesService,
                                  CacheRecycler cacheRecycler, PageCacheRecycler pageCacheRecycler,
                                  BigArrays bigArrays, HighlightPhase highlightPhase,
                                  ClusterService clusterService, ScriptService scriptService,
                                  MappingUpdatedAction mappingUpdatedAction,
                                  QueryPhase queryPhase, FetchPhase fetchPhase) {
        super(settings);
        this.indicesService = indicesService;
        this.cacheRecycler = cacheRecycler;
        this.pageCacheRecycler = pageCacheRecycler;
        this.bigArrays = bigArrays;
        this.clusterService = clusterService;
        this.highlightPhase = highlightPhase;
        this.scriptService = scriptService;
        this.mappingUpdatedAction = mappingUpdatedAction;
        this.queryPhase = queryPhase;
        this.fetchPhase = fetchPhase;
    }
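    /**
     * Runs every registered percolator query on this shard against the documents
     * in the request, returning one {@link BatchPercolateResponseItem} per
     * document id. Each item maps the ids of the matching queries to their
     * {@link QueryMatch}, including any highlights.
     *
     * A minimal sketch of consuming the result. {@code getResponses()} and
     * {@code getDocId()} are assumed accessors for the values handed to the
     * {@link BatchPercolateResponseItem} and {@link BatchPercolateShardResponse}
     * constructors; {@code getMatches()} and {@code getQueryId()} are used
     * elsewhere in this class:
     *
     * <pre>{@code
     * // 'service' is an injected BatchPercolatorService
     * BatchPercolateShardResponse response = service.percolate(request);
     * for (BatchPercolateResponseItem item : response.getResponses().values()) {
     *     for (QueryMatch match : item.getMatches().values()) {
     *         System.out.println(item.getDocId() + " matched " + match.getQueryId());
     *     }
     * }
     * }</pre>
     */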
    public BatchPercolateShardResponse percolate(BatchPercolateShardRequest request) throws IOException {
        long requestId = request.hashCode();
        IndexService percolateIndexService = indicesService.indexServiceSafe(request.shardId().getIndex());
        IndexShard indexShard = percolateIndexService.shardSafe(request.shardId().getId());
        indexShard.readAllowed(); // check if we can read the shard...

        ConcurrentMap<String, QueryAndSource> percolateQueries = percolateIndexService
                .shardInjectorSafe(indexShard.shardId().id())
                .getInstance(BatchPercolatorQueriesRegistry.class)
                .percolateQueries();

        List<ParsedDocument> parsedDocuments = parseRequest(percolateIndexService, request);
        // Check for an empty request before checking for the absence of queries,
        // otherwise we would build an "empty" response from a null document list.
        if (parsedDocuments == null) {
            throw new ElasticsearchIllegalArgumentException("Nothing to percolate");
        }
        if (percolateQueries.isEmpty()) {
            return new BatchPercolateShardResponse(emptyPercolateResponses(parsedDocuments),
                    request.shardId().getIndex(), request.shardId().id());
        }

        // We use a RAMDirectory here instead of a MemoryIndex.
        // In our tests MemoryIndex had worse indexing performance for normal-sized documents.
        RamDirectoryPercolatorIndex index = new RamDirectoryPercolatorIndex(indexShard.mapperService());
        Directory directory = index.indexDocuments(parsedDocuments);
        SearchContext context = createSearchContext(request, percolateIndexService, indexShard, directory);

        long filteringStart = System.currentTimeMillis();
        Map<String, QueryAndSource> filteredQueries = filterQueriesToSearchWith(percolateQueries, directory);
        logger.debug("{}-{} Percolation queries filtered down to '{}' items in '{}' ms.",
                request.shardId(), requestId, filteredQueries.size(), System.currentTimeMillis() - filteringStart);

        // Perform the actual matching
        Map<String, BatchPercolateResponseItem> responses = percolateResponses(context, filteredQueries, parsedDocuments);

        directory.close();
        context.close();
        percolateIndexService.fixedBitSetFilterCache().clear("Done percolating " + requestId);
        percolateIndexService.fieldData().clear();
        percolateIndexService.cache().clear("Done percolating " + requestId);

        return new BatchPercolateShardResponse(responses, request.shardId().getIndex(), request.shardId().id());
    }

    private Map<String, QueryAndSource> filterQueriesToSearchWith(ConcurrentMap<String, QueryAndSource> percolateQueries,
                                                                  Directory directory) throws IOException {
        Map<String, QueryAndSource> filteredQueries = new HashMap<>();
        try (DirectoryReader reader = DirectoryReader.open(directory)) {
            for (Map.Entry<String, QueryAndSource> entry : percolateQueries.entrySet()) {
                try {
                    if (hasDocumentMatchingFilter(reader, entry.getValue().getLimitingFilter())) {
                        filteredQueries.put(entry.getKey(), entry.getValue());
                    }
                } catch (Exception e) {
                    logger.warn("Failed to pre-filter query. Assuming that it should be matched anyway. Query ID: {}, Filter: {}",
                            e, entry.getKey(), entry.getValue().getLimitingFilter());
                    filteredQueries.put(entry.getKey(), entry.getValue());
                }
            }
        }
        return filteredQueries;
    }
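    /**
     * Returns true if any document in the in-memory index matches the given
     * limiting filter, or if no filter is present (in which case the query
     * cannot be ruled out and must be searched). A condensed sketch of the
     * Lucene 4.x contract the loop below relies on, for a single leaf reader;
     * both the {@link DocIdSet} and its iterator may legally be null, meaning
     * "no matching docs":
     *
     * <pre>{@code
     * DocIdSet idSet = filter.getDocIdSet(leaf, leaf.reader().getLiveDocs());
     * DocIdSetIterator it = (idSet == null) ? null : idSet.iterator();
     * boolean anyMatch = (it != null) && it.nextDoc() != DocIdSetIterator.NO_MORE_DOCS;
     * }</pre>
     */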
    private boolean hasDocumentMatchingFilter(IndexReader reader, Optional<Filter> optionalFilter) throws IOException {
        if (optionalFilter.isPresent()) {
            Filter filter = optionalFilter.get();
            boolean found = false;
            // If you are not familiar with Lucene, this basically means that we try to
            // create an iterator of matching doc ids for the filter, per leaf reader.
            // Both the filter and the DocIdSet may return null to enable optimisations,
            // hence the null checks. Null means that there were no matching docs, and
            // the same is true if the iterator points at NO_MORE_DOCS immediately.
            for (AtomicReaderContext leaf : reader.leaves()) {
                DocIdSet idSet = filter.getDocIdSet(leaf, leaf.reader().getLiveDocs());
                if (idSet != null) {
                    DocIdSetIterator iter = idSet.iterator();
                    if (iter != null && iter.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                        found = true;
                        break;
                    }
                }
            }
            return found;
        } else {
            return true;
        }
    }

    private static class DocSearcher extends Engine.Searcher {

        private DocSearcher(IndexSearcher searcher) {
            super("percolate", searcher);
        }

        @Override
        public void close() throws ElasticsearchException {
            try {
                this.reader().close();
            } catch (IOException e) {
                throw new ElasticsearchException("failed to close IndexReader in batch percolator", e);
            }
        }
    }

    private SearchContext createSearchContext(BatchPercolateShardRequest request, IndexService percolateIndexService,
                                              IndexShard indexShard, Directory directory) throws IOException {
        SearchShardTarget searchShardTarget = new SearchShardTarget(clusterService.localNode().id(),
                request.shardId().getIndex(), request.shardId().id());
        return new DefaultSearchContext(0,
                new ShardSearchLocalRequest(new ShardId("local_index", 0), 0, SearchType.QUERY_AND_FETCH, null, null, false),
                searchShardTarget,
                new DocSearcher(new IndexSearcher(DirectoryReader.open(directory))),
                percolateIndexService,
                indexShard,
                scriptService,
                cacheRecycler,
                pageCacheRecycler,
                bigArrays,
                Counter.newCounter());
    }

    private Map<String, BatchPercolateResponseItem> emptyPercolateResponses(List<ParsedDocument> parsedDocuments) {
        Map<String, BatchPercolateResponseItem> items = Maps.newHashMap();
        for (ParsedDocument document : parsedDocuments) {
            items.put(document.id(), new BatchPercolateResponseItem(Maps.<String, QueryMatch>newHashMap(), document.id()));
        }
        return items;
    }

    private List<ParsedDocument> parseRequest(IndexService documentIndexService, BatchPercolateShardRequest request)
            throws ElasticsearchException {
        BytesReference source = request.source();
        if (source == null || source.length() == 0) {
            return null;
        }

        List<ParsedDocument> docs = new ArrayList<>();
        XContentParser parser = null;
        try {
            parser = XContentFactory.xContent(source).createParser(source);
            XContentParser.Token token;
            while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
                if (token == XContentParser.Token.FIELD_NAME) {
                    if ("docs".equals(parser.currentName())) {
                        docs.addAll(parsedDocuments(documentIndexService, request, parser));
                    }
                }
            }
        } catch (Throwable e) {
            throw new ElasticsearchParseException("failed to parse request", e);
        } finally {
            if (parser != null) {
                parser.close();
            }
        }
        return docs;
    }

    private List<ParsedDocument> parsedDocuments(IndexService documentIndexService, BatchPercolateShardRequest request,
                                                 XContentParser parser) throws IOException {
        List<ParsedDocument> docs = new ArrayList<>();
        parser.nextToken(); // move from the field name to the start of the array
        while ((parser.nextToken()) != XContentParser.Token.END_ARRAY) {
            MapperService mapperService = documentIndexService.mapperService();
            Tuple<DocumentMapper, Boolean> docMapperTuple = mapperService.documentMapperWithAutoCreate(request.documentType());
            BytesStreamOutput bStream = new BytesStreamOutput();
            XContentBuilder builder = XContentFactory.contentBuilder(XContentType.JSON, bStream);
            builder.copyCurrentStructure(parser);
            builder.close();
            docs.add(getParsedDocument(documentIndexService, request, docMapperTuple.v1(), bStream));
            bStream.close();
        }
        return docs;
    }
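    // For reference, the request source consumed by parseRequest/parsedDocuments
    // above is a JSON object with a top-level "docs" array, one entry per document
    // to percolate. A minimal sketch (field names and values are illustrative only;
    // how the document id is derived depends on the type's mapping):
    //
    //   {
    //     "docs": [
    //       { "title": "some document" },
    //       { "title": "another document" }
    //     ]
    //   }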
    private ParsedDocument getParsedDocument(IndexService documentIndexService, BatchPercolateShardRequest request,
                                             DocumentMapper docMapper, BytesStreamOutput bStream) {
        ParsedDocument doc = docMapper.parse(bStream.bytes());
        if (doc.mappingsModified()) {
            // Parsing may have introduced new fields via dynamic mapping;
            // propagate the mapping change to the master before matching.
            mappingUpdatedAction.updateMappingOnMaster(request.shardId().getIndex(), docMapper,
                    documentIndexService.indexUUID());
        }
        return doc;
    }

    private Map<String, BatchPercolateResponseItem> percolateResponses(SearchContext context,
                                                                       Map<String, QueryAndSource> percolateQueries,
                                                                       List<ParsedDocument> parsedDocuments) {
        Map<String, BatchPercolateResponseItem> responses = Maps.newHashMap();
        for (ParsedDocument document : parsedDocuments) {
            responses.put(document.id(), new BatchPercolateResponseItem(document.id()));
        }

        for (Map.Entry<String, QueryAndSource> entry : percolateQueries.entrySet()) {
            try {
                executeSearch(context, entry.getValue());
                for (SearchHit searchHit : context.fetchResult().hits()) {
                    String id = searchHit.getId();
                    BatchPercolateResponseItem batchPercolateResponseItem = responses.get(id);
                    QueryMatch queryMatch = getQueryMatch(entry, searchHit);
                    batchPercolateResponseItem.getMatches().put(queryMatch.getQueryId(), queryMatch);
                }
            } catch (Exception e) {
                logger.warn("Failed to execute query. Will not add it to matches. Query ID: {}, Query: {}",
                        e, entry.getKey(), entry.getValue().getQuery());
            }
        }
        return responses;
    }

    private void executeSearch(SearchContext context, QueryAndSource queryAndSource) {
        parseHighlighting(context, queryAndSource.getSource());
        context.parsedQuery(new ParsedQuery(queryAndSource.getQuery(), ImmutableMap.<String, Filter>of()));
        if (context.from() == -1) {
            context.from(0);
        }
        if (context.size() == -1) {
            context.size(Integer.MAX_VALUE);
        }

        queryPhase.preProcess(context);
        fetchPhase.preProcess(context);
        queryPhase.execute(context);
        setDocIdsToLoad(context);
        fetchPhase.execute(context);
    }

    private QueryMatch getQueryMatch(Map.Entry<String, QueryAndSource> entry, SearchHit searchHit) {
        QueryMatch queryMatch = new QueryMatch();
        queryMatch.setQueryId(entry.getKey());
        queryMatch.setHighlighs(searchHit.highlightFields());
        return queryMatch;
    }

    private void setDocIdsToLoad(SearchContext context) {
        TopDocs topDocs = context.queryResult().topDocs();
        int totalSize = context.from() + context.size();
        int[] docIdsToLoad = new int[topDocs.totalHits];
        int counter = 0;
        for (int i = context.from(); i < totalSize; i++) {
            if (i < topDocs.scoreDocs.length) {
                docIdsToLoad[counter] = topDocs.scoreDocs[i].doc;
            } else {
                break;
            }
            counter++;
        }
        context.docIdsToLoad(docIdsToLoad, 0, counter);
    }

    // TODO: do this when the query is loaded into memory instead!
    private void parseHighlighting(SearchContext context, BytesReference source) {
        XContentParser parser = null;
        Map<String, ? extends SearchParseElement> hlElements = highlightPhase.parseElements();
        try {
            parser = XContentFactory.xContent(source).createParser(source);
            XContentParser.Token token;
            while ((token = parser.nextToken()) != null) {
                if (token == XContentParser.Token.FIELD_NAME) {
                    String fieldName = parser.currentName();
                    parser.nextToken();
                    SearchParseElement element = hlElements.get(fieldName);
                    if (element != null) {
                        element.parse(parser, context);
                        break;
                    }
                }
            }
        } catch (Exception e) {
            String sSource = "_na_";
            try {
                sSource = XContentHelper.convertToJson(source, false);
            } catch (Throwable ignore) {
                // keep the "_na_" placeholder if the source cannot be rendered
            }
            throw new SearchParseException(context, "Failed to parse source [" + sSource + "]", e);
        } finally {
            if (parser != null) {
                parser.close();
            }
        }
    }
}
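// A note on registered query sources: parseHighlighting runs the elements of a
// query's stored source through the parse elements exposed by the highlight
// phase (notably "highlight"). A registered percolator query could therefore
// carry highlighting options like this sketch (field names illustrative only):
//
//   {
//     "query": { "match": { "body": "elasticsearch" } },
//     "highlight": { "fields": { "body": {} } }
//   }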