// SearchEvent.java
// (C) 2005 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 10.10.2005 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.search.query;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.ConcurrentModificationException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Pattern;
import net.yacy.contentcontrol.ContentControlFilterUpdateThread;
import net.yacy.cora.document.analysis.Classification;
import net.yacy.cora.document.analysis.Classification.ContentDomain;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.solr.responsewriter.OpensearchResponseWriter;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.federate.yacy.Distribution;
import net.yacy.cora.lod.vocabulary.Tagging;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.sorting.ClusteredScoreMap;
import net.yacy.cora.sorting.ConcurrentScoreMap;
import net.yacy.cora.sorting.ReversibleScoreMap;
import net.yacy.cora.sorting.ScoreMap;
import net.yacy.cora.sorting.WeakPriorityBlockingQueue;
import net.yacy.cora.sorting.WeakPriorityBlockingQueue.Element;
import net.yacy.cora.sorting.WeakPriorityBlockingQueue.ReverseElement;
import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.retrieval.Response;
import net.yacy.data.WorkTables;
import net.yacy.document.LargeNumberCache;
import net.yacy.document.LibraryProvider;
import net.yacy.document.ProbabilisticClassifier;
import net.yacy.document.Tokenizer;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.data.word.WordReferenceFactory;
import net.yacy.kelondro.data.word.WordReferenceVars;
import net.yacy.kelondro.index.RowHandleSet;
import net.yacy.kelondro.rwi.ReferenceContainer;
import net.yacy.kelondro.rwi.TermSearch;
import net.yacy.kelondro.util.Bitfield;
import net.yacy.kelondro.util.ISO639;
import net.yacy.kelondro.util.MemoryControl;
import net.yacy.kelondro.util.SetTools;
import net.yacy.peers.RemoteSearch;
import net.yacy.peers.SeedDB;
import net.yacy.peers.graphics.ProfilingGraph;
import net.yacy.repository.FilterEngine;
import net.yacy.repository.LoaderDispatcher;
import net.yacy.repository.Blacklist.BlacklistType;
import net.yacy.search.EventTracker;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
import net.yacy.search.index.Segment;
import net.yacy.search.navigator.Navigator;
import net.yacy.search.navigator.NavigatorPlugins;
import net.yacy.search.ranking.ReferenceOrder;
import net.yacy.search.schema.CollectionConfiguration;
import net.yacy.search.schema.CollectionSchema;
import net.yacy.search.snippet.TextSnippet;
import net.yacy.search.snippet.TextSnippet.ResultClass;
import org.apache.solr.common.SolrDocument;
public final class SearchEvent {
private static final int max_results_rwi = 3000;
private static final int max_results_node = 150;
/*
private static long noRobinsonLocalRWISearch = 0;
static {
try {
noRobinsonLocalRWISearch = GenericFormatter.FORMAT_SHORT_DAY.parse("20121107").getTime();
} catch (final ParseException e) {
}
}
*/
public final static ConcurrentLog log = new ConcurrentLog("SEARCH");
public static final int SNIPPET_MAX_LENGTH = 220;
private static final int MAX_TOPWORDS = 12; // default count of words for the topic navigator
private long eventTime;
public QueryParams query;
public final SeedDB peers;
final WorkTables workTables;
public final SecondarySearchSuperviser secondarySearchSuperviser;
public final List<RemoteSearch> primarySearchThreadsL;
public final List<Thread> nodeSearchThreads;
public Thread[] secondarySearchThreads;
public final SortedSet<byte[]> preselectedPeerHashes;
private final SortedMap<byte[], Integer> IACount;
private final SortedMap<byte[], String> IAResults;
private final SortedMap<byte[], HeuristicResult> heuristics;
private byte[] IAmaxcounthash, IAneardhthash;
public Thread rwiProcess;
public Thread localsolrsearch;
/** Offset of the next local Solr index request
* Example : last local request with offset=10 and itemsPerPage=20, sets this attribute to 30. */
private int localsolroffset;
private final AtomicInteger expectedRemoteReferences, maxExpectedRemoteReferences; // counters for references that had been sorted out for other reasons
public final ScoreMap<String> locationNavigator; // a counter for the appearance of location coordinates
public final ScoreMap<String> protocolNavigator; // a counter for protocol types
public final ScoreMap<String> dateNavigator; // a counter for dates
public final ScoreMap<String> languageNavigator; // a counter for appearance of languages
public final Map<String, ScoreMap<String>> vocabularyNavigator; // counters for Vocabularies; key is metatag.getVocabularyName()
private final int topicNavigatorCount; // if 0 no topicNavigator, holds expected number of terms for the topicNavigator
// map of search custom/configured search navigators in addition to above standard navigators (which use special handling or display forms)
public final Map<String, Navigator> navigatorPlugins; // map of active search navigators key=internal navigator name
private final LoaderDispatcher loader;
private final HandleSet snippetFetchWordHashes; // a set of word hashes that are used to match with the snippets
private final boolean deleteIfSnippetFail;
private long urlRetrievalAllTime;
private long snippetComputationAllTime;
private ConcurrentHashMap<String, LinkedHashSet<String>> snippets;
private final boolean remote;
public final boolean addResultsToLocalIndex; // add received results to local index (default=true)
/** Maximum size allowed (in kbytes) for a remote document result to be stored to local index */
private long remoteStoredDocMaxSize;
private SortedMap<byte[], ReferenceContainer<WordReference>> localSearchInclusion;
private final ScoreMap<String> ref; // reference score computation for the commonSense heuristic
private final long maxtime;
private final ConcurrentHashMap<String, WeakPriorityBlockingQueue<WordReferenceVars>> doubleDomCache; // key = domhash (6 bytes); value = like stack
private final int[] flagcount; // flag counter
private final AtomicInteger feedersAlive, feedersTerminated, snippetFetchAlive;
private boolean addRunning;
private final AtomicInteger receivedRemoteReferences;
private final ReferenceOrder order;
private final HandleSet urlhashes; // map for double-check; String/Long relation, addresses ranking number (backreference for deletion)
private final Map<String, String> taggingPredicates; // a map from tagging vocabulary names to tagging predicate uris
private final WeakPriorityBlockingQueue<WordReferenceVars> rwiStack; // this is the bag where the RWI search process writes to
private final WeakPriorityBlockingQueue<URIMetadataNode> nodeStack; // this is the bag where the solr results are written to
private final WeakPriorityBlockingQueue<URIMetadataNode> resultList; // this is the result list where the actual search result is waiting to be displayed
private final boolean pollImmediately; // if true, every entry in the resultList is polled immediately to prevent a re-ranking in the resultList. This is useful if there is only one index source.
public final boolean excludeintext_image;
// the following values are filled during the search process as statistics for the search
public final AtomicInteger local_rwi_available; // the number of hits generated/ranked by the local search in rwi index
public final AtomicInteger local_rwi_stored; // the number of existing hits by the local search in rwi index
public final AtomicInteger remote_rwi_available; // the number of hits imported from remote peers (rwi/solr mixed)
public final AtomicInteger remote_rwi_stored; // the number of existing hits at remote site
public final AtomicInteger remote_rwi_peerCount; // the number of peers which contributed to the remote search result
public final AtomicInteger local_solr_available; // the number of hits generated/ranked by the local search in solr
public final AtomicInteger local_solr_stored; // the number of existing hits by the local search in solr
public final AtomicInteger remote_solr_available;// the number of hits imported from remote peers (rwi/solr mixed)
public final AtomicInteger remote_solr_stored; // the number of existing hits at remote site
public final AtomicInteger remote_solr_peerCount;// the number of peers which contributed to the remote search result
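/**
 * Estimate the total number of results as the maximum of two counters:
 * the sum of the hit counts contributed by the local/remote RWI and Solr
 * feeders, and the size of the already delivered plus still queued result
 * sets (imageViewed and sizeSpare are maintained elsewhere in this class).
 * @return an estimate of the total number of search results
 */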
public int getResultCount() {
return Math.max(
this.local_rwi_available.get() + this.remote_rwi_available.get() +
this.remote_solr_available.get() + this.local_solr_stored.get(),
imageViewed.size() + sizeSpare()
);
}
/**
* Set maximum size allowed (in kbytes) for a remote document result to be stored to local index.
* @param maxSize document content max size in kbytes. Zero or negative value means no limit.
*/
public void setRemoteDocStoredMaxSize(long maxSize) {
this.remoteStoredDocMaxSize = maxSize;
}
/**
* @return maximum size allowed (in kbytes) for a remote document result to be stored to local index.
* Zero or negative value means no limit.
*/
public long getRemoteDocStoredMaxSize() {
return this.remoteStoredDocMaxSize;
}
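/*
 * Minimal usage sketch (hypothetical values): cap remote documents that get
 * stored to the local index at 1 MB, i.e. 1024 kbytes; a value <= 0 removes
 * the limit again.
 *
 * searchEvent.setRemoteDocStoredMaxSize(1024);
 * long limit = searchEvent.getRemoteDocStoredMaxSize(); // 1024
 */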
protected SearchEvent(
final QueryParams query,
final SeedDB peers,
final WorkTables workTables,
final SortedSet<byte[]> preselectedPeerHashes,
final boolean generateAbstracts,
final LoaderDispatcher loader,
final int remote_maxcount,
final long remote_maxtime,
final boolean deleteIfSnippetFail,
final boolean addResultsToLocalIdx) {
long ab = MemoryControl.available();
if (ab < 1024 * 1024 * 200) {
int eb = SearchEventCache.size();
SearchEventCache.cleanupEvents(false);
int en = SearchEventCache.size();
if (en < eb) {
log.info("Cleaned up search event cache (1) " + eb + "->" + en + ", " + (ab - MemoryControl.available()) / 1024 / 1024 + " MB freed");
}
}
ab = MemoryControl.available();
int eb = SearchEventCache.size();
SearchEventCache.cleanupEvents(Math.max(1, (int) (MemoryControl.available() / (1024 * 1024 * 120))));
int en = SearchEventCache.size();
if (en < eb) {
log.info("Cleaned up search event cache (2) " + eb + "->" + en + ", " + (ab - MemoryControl.available()) / 1024 / 1024 + " MB freed");
}
this.eventTime = System.currentTimeMillis(); // for lifetime check
this.peers = peers;
this.workTables = workTables;
this.query = query;
if(query != null) {
/* The image counter may grow faster than the offset, but must initially start with the same value as the query offset */
this.imagePageCounter = query.offset;
}
this.loader = loader;
this.nodeStack = new WeakPriorityBlockingQueue<URIMetadataNode>(max_results_node, false);
this.maxExpectedRemoteReferences = new AtomicInteger(0);
this.expectedRemoteReferences = new AtomicInteger(0);
this.excludeintext_image = Switchboard.getSwitchboard().getConfigBool("search.excludeintext.image", true);
// prepare configured search navigation
final String navcfg = Switchboard.getSwitchboard().getConfig("search.navigation", "");
this.locationNavigator = navcfg.contains("location") ? new ConcurrentScoreMap<String>() : null;
this.protocolNavigator = navcfg.contains("protocol") ? new ConcurrentScoreMap<String>() : null;
this.dateNavigator = navcfg.contains("date") ? new ClusteredScoreMap<String>(true) : null;
this.topicNavigatorCount = navcfg.contains("topics") ? MAX_TOPWORDS : 0;
this.languageNavigator = navcfg.contains("language") ? new ConcurrentScoreMap<String>() : null;
this.vocabularyNavigator = new TreeMap<String, ScoreMap<String>>();
// prepare configured search navigation (plugins)
this.navigatorPlugins = NavigatorPlugins.initFromCfgString(navcfg);
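// Illustrative (assumed) configuration: "search.navigation" is a string of
// navigator names, e.g. "location,protocol,date,topics,language"; each
// contains() test above switches the corresponding navigator on, and
// NavigatorPlugins.initFromCfgString() picks up any additional plugin names
// from the same string.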
this.snippets = new ConcurrentHashMap<String, LinkedHashSet<String>>();
this.secondarySearchSuperviser = (this.query.getQueryGoal().getIncludeHashes().size() > 1) ? new SecondarySearchSuperviser(this) : null; // generate abstracts only for combined searches
if (this.secondarySearchSuperviser != null) this.secondarySearchSuperviser.start();
this.secondarySearchThreads = null;
this.preselectedPeerHashes = preselectedPeerHashes;
this.IAResults = new TreeMap<byte[], String>(Base64Order.enhancedCoder);
this.IACount = new TreeMap<byte[], Integer>(Base64Order.enhancedCoder);
this.heuristics = new TreeMap<byte[], HeuristicResult>(Base64Order.enhancedCoder);
this.IAmaxcounthash = null;
this.IAneardhthash = null;
this.remote = (peers != null && peers.sizeConnected() > 0) && (this.query.domType == QueryParams.Searchdom.CLUSTER || (this.query.domType == QueryParams.Searchdom.GLOBAL && Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.INDEX_RECEIVE_ALLOW_SEARCH, false)));
this.addResultsToLocalIndex = addResultsToLocalIdx;
/* Default: no size limit for storing remote result documents to the local index. Use the setter to modify it if needed. */
this.remoteStoredDocMaxSize = -1;
this.local_rwi_available = new AtomicInteger(0); // the number of results in the local peer after filtering
this.local_rwi_stored = new AtomicInteger(0);
this.local_solr_available = new AtomicInteger(0);
this.local_solr_stored = new AtomicInteger(0);
this.remote_rwi_stored = new AtomicInteger(0);
this.remote_rwi_available = new AtomicInteger(0); // the number of result contributions from all the remote dht peers
this.remote_rwi_peerCount = new AtomicInteger(0); // the number of remote dht peers that have contributed
this.remote_solr_stored = new AtomicInteger(0);
this.remote_solr_available= new AtomicInteger(0); // the number of result contributions from all the remote solr peers
this.remote_solr_peerCount= new AtomicInteger(0); // the number of remote solr peers that have contributed
final long start = System.currentTimeMillis();
// do a soft commit for fresh results
//query.getSegment().fulltext().commit(true);
// we collect the urlhashes and construct a list with urlEntry objects
// attention: if minEntries is too high, this method will not terminate within the maxTime
// sortorder: 0 = hash, 1 = url, 2 = ranking
this.localSearchInclusion = null;
this.ref = new ConcurrentScoreMap<String>();
this.maxtime = query.maxtime;
this.rwiStack = new WeakPriorityBlockingQueue<WordReferenceVars>(max_results_rwi, false);
this.doubleDomCache = new ConcurrentHashMap<String, WeakPriorityBlockingQueue<WordReferenceVars>>();
this.flagcount = new int[32];
for ( int i = 0; i < 32; i++ ) {
this.flagcount[i] = 0;
}
this.feedersAlive = new AtomicInteger(0);
this.feedersTerminated = new AtomicInteger(0);
this.snippetFetchAlive = new AtomicInteger(0);
this.addRunning = true;
this.receivedRemoteReferences = new AtomicInteger(0);
this.order = new ReferenceOrder(this.query.ranking, this.query.targetlang);
this.urlhashes = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 100);
this.taggingPredicates = new HashMap<String, String>();
for (Tagging t: LibraryProvider.autotagging.getVocabularies()) {
this.taggingPredicates.put(t.getName(), t.getPredicate());
}
// start a local solr search
if (!Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.DEBUG_SEARCH_LOCAL_SOLR_OFF, false)) {
this.localsolrsearch = RemoteSearch.solrRemoteSearch(this, this.query.solrQuery(this.query.contentdom, true, this.excludeintext_image), this.query.offset, this.query.itemsPerPage, null /*this peer*/, 0, Switchboard.urlBlacklist);
}
this.localsolroffset = this.query.offset + this.query.itemsPerPage;
// start a local RWI search concurrently
this.rwiProcess = null;
if (query.getSegment().connectedRWI() && !Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.DEBUG_SEARCH_LOCAL_DHT_OFF, false)) {
// start the local RWI search only if this segment has a connected RWI index and local DHT search is not switched off for debugging
rwiProcess = new RWIProcess(this.localsolrsearch);
rwiProcess.start();
}
if (this.remote) {
// start global searches
this.pollImmediately = false;
final long timer = System.currentTimeMillis();
if (this.query.getQueryGoal().getIncludeHashes().isEmpty()) {
this.primarySearchThreadsL = null;
this.nodeSearchThreads = null;
} else {
this.primarySearchThreadsL = new ArrayList<RemoteSearch>();
this.nodeSearchThreads = new ArrayList<Thread>();
// start this concurrently because the remote search needs an enumeration
// of the remote peers, which may block in some cases, e.g. when the DHT is
// active at the same time.
new Thread() {
@Override
public void run() {
this.setName("SearchEvent.init(" + query.getQueryGoal().getQueryString(false) + ")");
Thread.currentThread().setName("SearchEvent.primaryRemoteSearches");
RemoteSearch.primaryRemoteSearches(
SearchEvent.this,
0, remote_maxcount,
remote_maxtime,
Switchboard.urlBlacklist,
(SearchEvent.this.query.domType == QueryParams.Searchdom.GLOBAL) ? null : preselectedPeerHashes);
}
}.start();
}
if ( this.primarySearchThreadsL != null ) {
ConcurrentLog.fine("SEARCH_EVENT", "STARTING "
+ this.primarySearchThreadsL.size()
+ " THREADS TO CATCH EACH "
+ remote_maxcount
+ " URLs");
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(this.query.id(true), SearchEventType.REMOTESEARCH_START, "", this.primarySearchThreadsL.size(), System.currentTimeMillis() - timer), false);
// log the time needed to trigger the global search
ConcurrentLog.fine("SEARCH_EVENT", "SEARCH TIME AFTER GLOBAL-TRIGGER TO " + this.primarySearchThreadsL.size() + " PEERS: " + ((System.currentTimeMillis() - start) / 1000) + " seconds");
} else {
// no search since query is empty, user might have entered no data or filters have removed all search words
ConcurrentLog.fine("SEARCH_EVENT", "NO SEARCH STARTED DUE TO EMPTY SEARCH REQUEST.");
}
} else {
this.primarySearchThreadsL = null;
this.nodeSearchThreads = null;
this.pollImmediately = !query.getSegment().connectedRWI() || !Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.INDEX_RECEIVE_ALLOW_SEARCH, false);
if ( generateAbstracts ) {
// we need the results now
try {
if (rwiProcess != null && query.getSegment().connectedRWI()) rwiProcess.join();
} catch (final Throwable e ) {
}
// compute index abstracts
final long timer = System.currentTimeMillis();
int maxcount = -1;
long mindhtdistance = Long.MAX_VALUE, l;
byte[] wordhash;
assert !query.getSegment().connectedRWI() || this.searchContainerMap() != null;
if (this.searchContainerMap() != null) {
for (final Map.Entry<byte[], ReferenceContainer<WordReference>> entry : this.searchContainerMap().entrySet()) {
wordhash = entry.getKey();
final ReferenceContainer<WordReference> container = entry.getValue();
assert (Base64Order.enhancedCoder.equal(container.getTermHash(), wordhash)) : "container.getTermHash() = " + ASCII.String(container.getTermHash()) + ", wordhash = " + ASCII.String(wordhash);
if ( container.size() > maxcount ) {
this.IAmaxcounthash = wordhash;
maxcount = container.size();
}
l = Distribution.horizontalDHTDistance(wordhash, ASCII.getBytes(peers.mySeed().hash));
if ( l < mindhtdistance ) {
// calculate the word hash that is closest to our dht position
mindhtdistance = l;
this.IAneardhthash = wordhash;
}
this.IACount.put(wordhash, LargeNumberCache.valueOf(container.size()));
this.IAResults.put(wordhash, WordReferenceFactory.compressIndex(container, null, 1000).toString());
}
}
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(this.query.id(true), SearchEventType.ABSTRACTS, "", this.searchContainerMap() == null ? 0 : this.searchContainerMap().size(), System.currentTimeMillis() - timer), false);
} else {
// give process time to accumulate a certain amount of data
// before a reading process wants to get results from it
try {
if (rwiProcess != null && query.getSegment().connectedRWI() && rwiProcess.isAlive()) rwiProcess.join(100);
} catch (final Throwable e ) {
}
// this will reduce the maximum waiting time until results are available to 100 milliseconds
// while we always get a good set of ranked data
}
}
// start worker threads to fetch urls and snippets
this.deleteIfSnippetFail = deleteIfSnippetFail;
this.urlRetrievalAllTime = 0;
this.snippetComputationAllTime = 0;
this.resultList = new WeakPriorityBlockingQueue<URIMetadataNode>(Math.max(max_results_node, 10 * query.itemsPerPage()), true); // this is the result, enriched with snippets, ranked and ordered by ranking
// snippets do not need to match with the complete query hashes,
// only with the query minus the stopwords, which were not used for the search
boolean filtered = false;
// check if query contains stopword
if (Switchboard.stopwordHashes != null) {
Iterator<byte[]> it = query.getQueryGoal().getIncludeHashes().iterator();
while (it.hasNext()) {
if (Switchboard.stopwordHashes.contains((it.next()))) {
filtered = true;
break;
}
}
}
this.snippetFetchWordHashes = query.getQueryGoal().getIncludeHashes().clone();
if (filtered) { // remove stopwords
this.snippetFetchWordHashes.excludeDestructive(Switchboard.stopwordHashes);
}
// clean up events
SearchEventCache.cleanupEvents(false);
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(this.query.id(true), SearchEventType.CLEANUP, "", 0, 0), false);
// store this search to a cache so it can be re-used
if ( MemoryControl.available() < 1024 * 1024 * 100 ) {
SearchEventCache.cleanupEvents(false);
}
SearchEventCache.put(this.query.id(false), this);
}
private class RWIProcess extends Thread {
final Thread waitForThread;
public RWIProcess(final Thread waitForThread) {
super("SearchEvent.RWIProcess(" + (waitForThread != null ? waitForThread.getName() : "") + ")");
this.waitForThread = waitForThread;
}
@Override
public void run() {
if (query.getSegment().termIndex() == null) return; // nothing to do; this index is not used
// do a search
oneFeederStarted();
// sort the local containers and truncate them to a limited count,
// so that following sortings together with the global results will be fast
try {
final long timer = System.currentTimeMillis();
TermSearch<WordReference> search =
SearchEvent.this.query
.getSegment()
.termIndex()
.query(
SearchEvent.this.query.getQueryGoal().getIncludeHashes(),
SearchEvent.this.query.getQueryGoal().getExcludeHashes(),
null,
Segment.wordReferenceFactory,
SearchEvent.this.query.maxDistance);
SearchEvent.this.localSearchInclusion = search.inclusion();
ReferenceContainer<WordReference> index = search.joined();
if ( !index.isEmpty() ) {
// in case another thread has priority for its results, wait until it is finished
if (this.waitForThread != null && this.waitForThread.isAlive()) {
this.waitForThread.join();
}
// add the index to the result
int successcount = addRWIs(index, true, "local index: " + SearchEvent.this.query.getSegment().getLocation(), index.size(), SearchEvent.this.maxtime);
if (successcount == 0 &&
SearchEvent.this.query.getQueryGoal().getIncludeHashes().has(Segment.catchallHash) &&
SearchEvent.this.query.modifier.sitehost != null && SearchEvent.this.query.modifier.sitehost.length() > 0
) {
// try again with sitehost
String newGoal = Domains.getSmartSLD(SearchEvent.this.query.modifier.sitehost);
search =
SearchEvent.this.query
.getSegment()
.termIndex()
.query(
QueryParams.hashes2Set(ASCII.String(Word.word2hash(newGoal))),
SearchEvent.this.query.getQueryGoal().getExcludeHashes(),
null,
Segment.wordReferenceFactory,
SearchEvent.this.query.maxDistance);
SearchEvent.this.localSearchInclusion = search.inclusion();
index = search.joined();
if (!index.isEmpty()) {
successcount = addRWIs(index, true, "local index: " + SearchEvent.this.query.getSegment().getLocation(), index.size(), SearchEvent.this.maxtime);
}
}
EventTracker.update(
EventTracker.EClass.SEARCH,
new ProfilingGraph.EventSearch(
SearchEvent.this.query.id(true),
SearchEventType.JOIN,
SearchEvent.this.query.getQueryGoal().getQueryString(false),
successcount,
System.currentTimeMillis() - timer),
false);
SearchEvent.this.addFinalize();
}
} catch (final Exception e ) {
ConcurrentLog.logException(e);
} finally {
oneFeederTerminated();
}
}
}
public int addRWIs(
final ReferenceContainer<WordReference> index,
final boolean local,
final String resourceName,
final int fullResource,
final long maxtime) {
// we collect the urlhashes and construct a list with urlEntry objects
// attention: if minEntries is too high, this method will not terminate within the maxTime
//Log.logInfo("SearchEvent", "added a container, size = " + index.size());
this.addRunning = true;
assert (index != null);
if (index.isEmpty()) return 0;
if (local) {
assert fullResource >= 0 : "fullResource = " + fullResource;
this.local_rwi_stored.addAndGet(fullResource);
} else {
assert fullResource >= 0 : "fullResource = " + fullResource;
this.remote_rwi_stored.addAndGet(fullResource);
this.remote_rwi_peerCount.incrementAndGet();
}
long timer = System.currentTimeMillis();
// normalize entries
final BlockingQueue<WordReferenceVars> decodedEntries = this.order.normalizeWith(index, maxtime, local);
int is = index.size();
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(
this.query.id(true),
SearchEventType.NORMALIZING,
resourceName,
is,
System.currentTimeMillis() - timer), false);
if (!local) this.receivedRemoteReferences.addAndGet(is);
// iterate over normalized entries and select some that are better than currently stored
timer = System.currentTimeMillis();
// apply all constraints
long timeout = maxtime == Long.MAX_VALUE ? Long.MAX_VALUE : System.currentTimeMillis() + maxtime;
int successcounter = 0;
try {
WordReferenceVars iEntry;
long remaining;
String acceptableAlternativeSitehash = null;
if (this.query.modifier.sitehost != null && this.query.modifier.sitehost.length() > 0) try {
acceptableAlternativeSitehash = DigestURL.hosthash(this.query.modifier.sitehost.startsWith("www.") ? this.query.modifier.sitehost.substring(4) : "www." + this.query.modifier.sitehost, 80);
} catch (MalformedURLException e1) {}
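// Example of the alternative sitehash computed above (hypothetical host):
// for the modifier "site:www.example.com" the hash of "example.com" is also
// accepted, and for "site:example.com" the hash of "www.example.com", so the
// www. prefix does not hide matching hosts.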
pollloop: while ( true ) {
remaining = timeout - System.currentTimeMillis();
if (remaining <= 0) {
ConcurrentLog.warn("SearchEvent", "terminated 'add' loop before poll time-out = " + remaining + ", decodedEntries.size = " + decodedEntries.size());
break;
}
iEntry = decodedEntries.poll(remaining, TimeUnit.MILLISECONDS);
if (iEntry == null) {
ConcurrentLog.warn("SearchEvent", "terminated 'add' loop after poll time-out = " + remaining + ", decodedEntries.size = " + decodedEntries.size());
break pollloop;
}
if (iEntry == WordReferenceVars.poison) {
break pollloop;
}
assert (iEntry.urlhash().length == index.row().primaryKeyLength);
// doublecheck for urls
if (this.urlhashes.has(iEntry.urlhash())) {
if (log.isFine()) log.fine("dropped RWI: doublecheck");
continue pollloop;
}
// increase flag counts
Bitfield flags = iEntry.flags();
for (int j = 0; j < 32; j++) {
if (flags.get(j)) this.flagcount[j]++;
}
// check constraints
if (!this.testFlags(flags)) {
if (log.isFine()) log.fine("dropped RWI: flag test failed");
continue pollloop;
}
// check document domain
if (this.query.contentdom.getCode() > 0 &&
((this.query.contentdom == ContentDomain.AUDIO && !(flags.get(Tokenizer.flag_cat_hasaudio))) ||
(this.query.contentdom == ContentDomain.VIDEO && !(flags.get(Tokenizer.flag_cat_hasvideo))) ||
(this.query.contentdom == ContentDomain.IMAGE && !(flags.get(Tokenizer.flag_cat_hasimage))) ||
(this.query.contentdom == ContentDomain.APP && !(flags.get(Tokenizer.flag_cat_hasapp))))) {
if (log.isFine()) log.fine("dropped RWI: contentdom fail");
continue pollloop;
}
// count domZones
//this.domZones[DigestURI.domDomain(iEntry.metadataHash())]++;
// check site constraints
final String hosthash = iEntry.hosthash();
if ( this.query.modifier.sitehash == null ) {
if (this.query.siteexcludes != null && this.query.siteexcludes.contains(hosthash)) {
if (log.isFine()) log.fine("dropped RWI: siteexcludes");
continue pollloop;
}
} else {
// filter out all domains that do not match with the site constraint
if (!hosthash.equals(this.query.modifier.sitehash) && (acceptableAlternativeSitehash == null || !hosthash.equals(acceptableAlternativeSitehash))) {
if (log.isFine()) log.fine("dropped RWI: modifier.sitehash");
continue pollloop;
}
}
// finally extend the double-check and insert result to stack
this.urlhashes.putUnique(iEntry.urlhash());
rankingtryloop: while (true) {
try {
this.rwiStack.put(new ReverseElement<WordReferenceVars>(iEntry, this.order.cardinal(iEntry))); // inserts the element and removes the worst (which is smallest)
break rankingtryloop;
} catch (final ArithmeticException e ) {
// this may happen if the concurrent normalizer changes values during cardinal computation
if (log.isFine()) log.fine("dropped RWI: arithmetic exception");
continue rankingtryloop;
}
}
// increase counter for statistics
if (local) this.local_rwi_available.incrementAndGet(); else this.remote_rwi_available.incrementAndGet();
successcounter++;
}
if (System.currentTimeMillis() >= timeout) ConcurrentLog.warn("SearchEvent", "rwi normalization ended with timeout = " + maxtime);
} catch (final InterruptedException e ) {
} catch (final SpaceExceededException e ) {
}
//if ((query.neededResults() > 0) && (container.size() > query.neededResults())) remove(true, true);
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(
this.query.id(true),
SearchEventType.PRESORT,
resourceName,
index.size(),
System.currentTimeMillis() - timer), false);
return successcounter;
}
public long getEventTime() {
return this.eventTime;
}
protected void resetEventTime() {
this.eventTime = System.currentTimeMillis();
}
protected void cleanup() {
// stop all threads
if (this.localsolrsearch != null) {
synchronized (this.localsolrsearch) {if (this.localsolrsearch.isAlive()) this.localsolrsearch.interrupt();}
}
if (this.nodeSearchThreads != null) {
for (final Thread search : this.nodeSearchThreads) {
if (search != null) {
synchronized (search) {if (search.isAlive()) {search.interrupt();}}
}
}
}
if (this.primarySearchThreadsL != null) {
for (final RemoteSearch search : this.primarySearchThreadsL) {
if (search != null) {
synchronized (search) {if (search.isAlive()) {search.interrupt();}}
}
}
}
if (this.secondarySearchThreads != null) {
for (final Thread search : this.secondarySearchThreads ) {
if (search != null) {
synchronized (search) {if (search.isAlive()) {search.interrupt();}}
}
}
}
// clear all data structures
if (this.preselectedPeerHashes != null) this.preselectedPeerHashes.clear();
if (this.IACount != null) this.IACount.clear();
if (this.IAResults != null) this.IAResults.clear();
if (this.heuristics != null) this.heuristics.clear();
this.rwiStack.clear();
this.nodeStack.clear();
this.resultList.clear();
}
public String abstractsString(final byte[] hash) {
return this.IAResults.get(hash);
}
public Iterator<Map.Entry<byte[], Integer>> abstractsCount() {
return this.IACount.entrySet().iterator();
}
public int abstractsCount(final byte[] hash) {
final Integer i = this.IACount.get(hash);
if ( i == null ) {
return -1;
}
return i.intValue();
}
public byte[] getAbstractsMaxCountHash() {
return this.IAmaxcounthash;
}
public byte[] getAbstractsNearDHTHash() {
return this.IAneardhthash;
}
public List<RemoteSearch> getPrimarySearchThreads() {
return this.primarySearchThreadsL;
}
public Thread[] getSecondarySearchThreads() {
return this.secondarySearchThreads;
}
public void addHeuristic(final byte[] urlhash, final String heuristicName, final boolean redundant) {
synchronized ( this.heuristics ) {
this.heuristics.put(urlhash, new HeuristicResult(urlhash, heuristicName, redundant));
}
}
public HeuristicResult getHeuristic(final byte[] urlhash) {
synchronized ( this.heuristics ) {
return this.heuristics.get(urlhash);
}
}
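/*
 * Usage sketch for the heuristic bookkeeping above (hypothetical heuristic
 * name): a heuristic process registers each URL hash it contributed, and a
 * later lookup tells the presentation layer where a result came from.
 *
 * event.addHeuristic(urlhash, "site-operator", false);
 * HeuristicResult hr = event.getHeuristic(urlhash); // null if not heuristic
 */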
public void addNodes(
final List<URIMetadataNode> nodeList,
final Map<String, ReversibleScoreMap<String>> facets, // a map from a field name to scored values
final Map<String, LinkedHashSet<String>> solrsnippets, // a map from urlhash to snippet text
final boolean local,
final String resourceName,
final int fullResource) {
this.addBegin();
// check if all results have snippets
/*
for (URIMetadataNode node: nodeList) {
if (!facets.containsKey(ASCII.String(node.hash()))) {
log.logInfo("no snippet from Solr for " + node.url().toNormalform(true));
}
}
*/
this.snippets.putAll(solrsnippets);
assert (nodeList != null);
if (nodeList.isEmpty()) return;
if (local) {
this.local_solr_stored.set(fullResource);
} else {
assert fullResource >= 0 : "fullResource = " + fullResource;
this.remote_solr_stored.addAndGet(fullResource);
this.remote_solr_peerCount.incrementAndGet();
}
long timer = System.currentTimeMillis();
// normalize entries
int is = nodeList.size();
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(this.query.id(true), SearchEventType.NORMALIZING, resourceName, is, System.currentTimeMillis() - timer), false);
if (!local) {
this.receivedRemoteReferences.addAndGet(is);
}
// iterate over normalized entries and select some that are better than currently stored
timer = System.currentTimeMillis();
// collect navigation information
// iterate over active navigator plugins to let them update the counters
for (String s : this.navigatorPlugins.keySet()) {
Navigator navi = this.navigatorPlugins.get(s);
if (navi != null) {
if (facets == null || facets.isEmpty() || !facets.containsKey(navi.getIndexFieldName())) { // just in case we got no solr facet
navi.incDocList(nodeList);
} else {
navi.incFacet(facets);
}
}
}
ReversibleScoreMap<String> fcts;
if (this.locationNavigator != null) {
fcts = facets.get(CollectionSchema.coordinate_p_0_coordinate.getSolrFieldName());
if (fcts != null) {
for (String coordinate: fcts) {
int hc = fcts.get(coordinate);
if (hc == 0) continue;
this.locationNavigator.inc(coordinate, hc);
}
}
}
if (this.dateNavigator != null) {
fcts = facets.get(CollectionSchema.dates_in_content_dts.getSolrFieldName());
if (fcts != null) this.dateNavigator.inc(fcts);
}
if (this.languageNavigator != null) {
fcts = facets.get(CollectionSchema.language_s.getSolrFieldName());
if (fcts != null) {
// remove unknown languages
Iterator<String> i = fcts.iterator();
while (i.hasNext()) {
String lang = i.next();
if (!ISO639.exists(lang)) {
i.remove();
}
}
this.languageNavigator.inc(fcts);
}
}
if (this.protocolNavigator != null) {
fcts = facets.get(CollectionSchema.url_protocol_s.getSolrFieldName());
if (fcts != null) {
// remove all protocols that we don't know (exact match; a substring test would also accept fragments like "tp")
Iterator<String> i = fcts.iterator();
while (i.hasNext()) {
String protocol = i.next();
if (!("http".equals(protocol) || "https".equals(protocol) || "smb".equals(protocol) || "ftp".equals(protocol) || "file".equals(protocol))) i.remove();
}
this.protocolNavigator.inc(fcts);
}
}
// get the vocabulary navigation
Set<String> genericFacets = new LinkedHashSet<>();
for (Tagging v: LibraryProvider.autotagging.getVocabularies()) genericFacets.add(v.getName());
genericFacets.addAll(ProbabilisticClassifier.getContextNames());
for (String v: genericFacets) {
fcts = facets.get(CollectionSchema.VOCABULARY_PREFIX + v + CollectionSchema.VOCABULARY_TERMS_SUFFIX);
if (fcts != null) {
ScoreMap<String> vocNav = this.vocabularyNavigator.get(v);
if (vocNav == null) {
vocNav = new ConcurrentScoreMap<String>();
this.vocabularyNavigator.put(v, vocNav);
}
vocNav.inc(fcts);
}
}
// apply all constraints
try {
pollloop: for (URIMetadataNode iEntry: nodeList) {
if ( !this.query.urlMask_isCatchall ) {
// check url mask
if (!iEntry.matches(this.query.urlMaskPattern)) {
if (log.isFine()) log.fine("dropped Node: url mask does not match");
continue pollloop;
}
}
// doublecheck for urls
if (this.urlhashes.has(iEntry.hash())) {
if (log.isFine()) log.fine("dropped Node: double check");
continue pollloop;
}
// increase flag counts
for ( int j = 0; j < 32; j++ ) {
if (iEntry.flags().get(j)) this.flagCount()[j]++;
}
// check constraints
Bitfield flags = iEntry.flags();
if (!this.testFlags(flags)) {
if (log.isFine()) log.fine("dropped Node: flag test");
continue pollloop;
}
// check document domain
if (this.query.contentdom.getCode() > 0 &&
((this.query.contentdom == ContentDomain.AUDIO && !(flags.get(Tokenizer.flag_cat_hasaudio))) ||
(this.query.contentdom == ContentDomain.VIDEO && !(flags.get(Tokenizer.flag_cat_hasvideo))) ||
(this.query.contentdom == ContentDomain.IMAGE && !(flags.get(Tokenizer.flag_cat_hasimage))) ||
(this.query.contentdom == ContentDomain.APP && !(flags.get(Tokenizer.flag_cat_hasapp))))) {
if (log.isFine()) log.fine("dropped Node: content domain does not match");
continue pollloop;
}
// filter out media links in text search, if wanted
String ext = MultiProtocolURL.getFileExtension(iEntry.url().getFileName());
if (this.query.contentdom == ContentDomain.TEXT && Classification.isImageExtension(ext) && this.excludeintext_image) {
if (log.isFine()) log.fine("dropped Node: file name domain does not match");
continue pollloop;
}
// check site constraints
final String hosthash = iEntry.hosthash();
if ( this.query.modifier.sitehash == null ) {
if (this.query.siteexcludes != null && this.query.siteexcludes.contains(hosthash)) {
if (log.isFine()) log.fine("dropped Node: siteexclude");
continue pollloop;
}
} else {
// filter out all domains that do not match with the site constraint
if (iEntry.url().getHost().indexOf(this.query.modifier.sitehost) < 0) {
if (log.isFine()) log.fine("dropped Node: sitehost");
continue pollloop;
}
}
if (this.query.modifier.language != null) {
if (!this.query.modifier.language.equals(iEntry.language())) {
if (log.isFine()) log.fine("dropped Node: language");
continue pollloop;
}
}
if (this.query.modifier.author != null) {
if (!this.query.modifier.author.equals(iEntry.dc_creator())) {
if (log.isFine()) log.fine ("dropped Node: author");
continue pollloop;
}
}
// finally extend the double-check and insert result to stack
this.urlhashes.putUnique(iEntry.hash());
rankingtryloop: while (true) {
try {
long score;
// determine nodestack ranking (will be altered by postranking)
// so far the Solr score is used (with an arbitrary factor to get values similar to rwi ranking values)
Float scorex = (Float) iEntry.getFieldValue("score"); // this is a special field containing the ranking score of a Solr search result
if (scorex != null && scorex > 0)
score = (long) ((1000000.0f * scorex) - iEntry.urllength()); // we modify the score here since the solr score is equal in many cases, and then the order would simply depend on the url hash, which would be silly
else
score = this.order.cardinal(iEntry);
this.nodeStack.put(new ReverseElement<URIMetadataNode>(iEntry, score)); // inserts the element and removes the worst (which is smallest)
break rankingtryloop;
} catch (final ArithmeticException e ) {
// this may happen if the concurrent normalizer changes values during cardinal computation
continue rankingtryloop;
}
}
// increase counter for statistics
if (local) this.local_solr_available.incrementAndGet(); else this.remote_solr_available.incrementAndGet();
}
} catch (final SpaceExceededException e ) {
}
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(this.query.id(true), SearchEventType.PRESORT, resourceName, nodeList.size(), System.currentTimeMillis() - timer), false);
}
public void addExpectedRemoteReferences(int x) {
if ( x > 0 ) {
this.maxExpectedRemoteReferences.addAndGet(x);
}
this.expectedRemoteReferences.addAndGet(x);
}
/**
 * Take the one best entry from the rwiStack and create a node entry out of it.
 * There is no waiting or blocking; if no entry is available this just returns null.
 * If the skipDoubleDom option is selected, only entries from different hosts are returned
 * until no such rwi exists; then the best entries from the per-domain stacks are returned.
 * @param skipDoubleDom
 * @return a node from a rwi entry if one exists or null if not (with score value set)
 */
private URIMetadataNode pullOneRWI(final boolean skipDoubleDom) {
// returns from the current RWI list the best entry and removes this entry from the list
WeakPriorityBlockingQueue<WordReferenceVars> m;
WeakPriorityBlockingQueue.Element<WordReferenceVars> rwi = null;
mainloop: while (true) {
int c = 0;
pollloop: while (this.rwiStack.sizeQueue() > 0 && c++ < 10) {
rwi = this.rwiStack.poll();
if (rwi == null) return null;
if (!skipDoubleDom) {
URIMetadataNode node = this.query.getSegment().fulltext().getMetadata(rwi);
if (node == null) continue pollloop;
return node;
}
// check doubledom
final String hosthash = rwi.getElement().hosthash();
m = this.doubleDomCache.get(hosthash);
if (m == null) {
synchronized ( this.doubleDomCache ) {
m = this.doubleDomCache.get(hosthash);
if (m == null) {
// first appearance of this domain: we create an entry to signal that one result of that domain was already returned
m = new WeakPriorityBlockingQueue<WordReferenceVars>(max_results_rwi, false);
this.doubleDomCache.put(hosthash, m);
URIMetadataNode node = this.query.getSegment().fulltext().getMetadata(rwi);
if (node == null) continue pollloop;
return node;
}
// second appearance of this domain
m.put(rwi);
}
} else {
m.put(rwi);
}
}
// no more entries in sorted RWI entries. Now take Elements from the doubleDomCache
if (this.doubleDomCache.isEmpty()) {
//Log.logWarning("SearchEvent", "doubleDomCache.isEmpty");
return null;
}
// find best entry from all caches
WeakPriorityBlockingQueue.Element<WordReferenceVars> bestEntry = null;
WeakPriorityBlockingQueue.Element<WordReferenceVars> o;
final Iterator<WeakPriorityBlockingQueue<WordReferenceVars>> i = this.doubleDomCache.values().iterator();
doubleloop: while (i.hasNext()) {
try {
m = i.next();
} catch (final ConcurrentModificationException e) {
ConcurrentLog.logException(e);
continue mainloop; // not the best solution...
}
if (m == null) continue doubleloop;
if (m.isEmpty()) continue doubleloop;
if (bestEntry == null) {
bestEntry = m.peek();
continue doubleloop;
}
o = m.peek();
if (o == null) continue doubleloop;
if (o.getWeight() > bestEntry.getWeight()) bestEntry = o;
}
if (bestEntry == null) {
//Log.logWarning("SearchEvent", "bestEntry == null (1)");
return null;
}
// finally remove the best entry from the doubledom cache
m = this.doubleDomCache.get(bestEntry.getElement().hosthash());
if (m != null) {
bestEntry = m.poll();
if (bestEntry != null && m.sizeAvailable() == 0) {
synchronized ( this.doubleDomCache ) {
if (m.sizeAvailable() == 0) {
this.doubleDomCache.remove(bestEntry.getElement().hosthash());
}
}
}
}
if (bestEntry == null) {
//Log.logWarning("SearchEvent", "bestEntry == null (2)");
return null;
}
URIMetadataNode node = null;
try {
node = this.query.getSegment().fulltext().getMetadata(bestEntry);
} catch (Throwable e) {
ConcurrentLog.logException(e);
}
if (node == null) {
if (bestEntry.getElement().local()) this.local_rwi_available.decrementAndGet(); else this.remote_rwi_available.decrementAndGet();
if (log.isFine()) log.fine("dropped RWI: hash not in metadata");
continue mainloop;
}
return node;
}
}
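// Sketch of the skipDoubleDom behavior above (hypothetical ranking order):
// with ranked entries a.com/1, a.com/2, b.com/1 the first poll returns
// a.com/1, the second b.com/1, while a.com/2 is parked in the doubleDomCache
// under its host hash; once the rwiStack runs dry, the best parked entry
// over all per-host queues is returned.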
/**
 * get one metadata entry from the ranked results. This will be the 'best' entry so far according to the
 * applied ranking. If there are no more entries left or the timeout limit is reached then null is
 * returned. The caller may distinguish the timeout case from the case where no more entries will ever
 * become available by calling this.feedingIsFinished()
 *
 * @param skipDoubleDom should be true if it is wanted that double domain entries are skipped
 * @return a metadata entry for a url (with score value set)
 */
public URIMetadataNode pullOneFilteredFromRWI(final boolean skipDoubleDom) {
// returns from the current RWI list the best URL entry and removes this entry from the list
URIMetadataNode page;
mainloop: while ((page = pullOneRWI(skipDoubleDom)) != null) {
if (!this.query.urlMask_isCatchall && !page.matches(this.query.urlMaskPattern)) {
if (log.isFine()) log.fine("dropped RWI: no match with urlMask");
if (page.word().local()) this.local_rwi_available.decrementAndGet(); else this.remote_rwi_available.decrementAndGet();
continue;
}
// check for more errors
if (page.url() == null) {
if (log.isFine()) log.fine("dropped RWI: url == null");
if (page.word().local()) this.local_rwi_available.decrementAndGet(); else this.remote_rwi_available.decrementAndGet();
continue; // rare case where the url is corrupted
}
// check content domain
ContentDomain contentDomain = page.getContentDomain();
if (this.query.contentdom.getCode() > 0 && (
(this.query.contentdom == Classification.ContentDomain.IMAGE && contentDomain != Classification.ContentDomain.IMAGE) ||
(this.query.contentdom == Classification.ContentDomain.AUDIO && contentDomain != Classification.ContentDomain.AUDIO) ||
(this.query.contentdom == Classification.ContentDomain.VIDEO && contentDomain != Classification.ContentDomain.VIDEO) ||
(this.query.contentdom == Classification.ContentDomain.APP && contentDomain != Classification.ContentDomain.APP)) && this.query.urlMask_isCatchall) {
if (log.isFine()) log.fine("dropped RWI: wrong contentdom = " + this.query.contentdom + ", domain = " + contentDomain);
if (page.word().local()) this.local_rwi_available.decrementAndGet(); else this.remote_rwi_available.decrementAndGet();
continue;
}
// filter out media links in text search, if wanted
String ext = MultiProtocolURL.getFileExtension(page.url().getFileName());
if (this.query.contentdom == ContentDomain.TEXT && Classification.isImageExtension(ext) && this.excludeintext_image) {
if (log.isFine()) log.fine("dropped RWI: file name domain does not match");
continue;
}
// filter query modifier variables (these are host, filetype, protocol, language, author, collection, dates_in_content(on,from,to,timezone))
// while (protocol, host, filetype) may currently be incorporated in the (this.query.urlMaskPattern) query parameter
// check modifier constraint filetype (using fileextension)
if (this.query.modifier.filetype != null && !this.query.modifier.filetype.equals(ext)) {
if (log.isFine()) log.fine("dropped RWI: file type constraint = " + this.query.modifier.filetype);
if (page.word().local()) this.local_rwi_available.decrementAndGet(); else this.remote_rwi_available.decrementAndGet();
continue;
}
// check modifier constraint (language)
if (this.query.modifier.language != null && !this.query.modifier.language.equals(page.language())) {
if (log.isFine()) log.fine("dropped RWI: language constraint = " + this.query.modifier.language);
if (page.word().local()) this.local_rwi_available.decrementAndGet(); else this.remote_rwi_available.decrementAndGet();
continue;
}
// check modifier constraint (author)
if (this.query.modifier.author != null && !page.dc_creator().toLowerCase().contains(this.query.modifier.author.toLowerCase()) /*!this.query.modifier.author.equalsIgnoreCase(page.dc_creator())*/) {
if (log.isFine()) log.fine("dropped RWI: author constraint = " + this.query.modifier.author);
if (page.word().local()) this.local_rwi_available.decrementAndGet(); else this.remote_rwi_available.decrementAndGet();
continue;
}
// check modifier constraint collection
// this is not available in pure RWI entries (but is in local entries or in metadata/entries received via solr queries),
if (this.query.modifier.collection != null) {
Collection<Object> docCols = page.getFieldValues(CollectionSchema.collection_sxt.getSolrFieldName()); // get multivalued value
if (docCols == null) { // no collection info
if (page.word().local()) this.local_rwi_available.decrementAndGet(); else this.remote_rwi_available.decrementAndGet();
continue;
} else if (!docCols.contains(this.query.modifier.collection)) {
if (page.word().local()) this.local_rwi_available.decrementAndGet(); else this.remote_rwi_available.decrementAndGet();
continue;
}
}
// Check for blacklist
if (Switchboard.urlBlacklist.isListed(BlacklistType.SEARCH, page.url())) {
if (log.isFine()) log.fine("dropped RWI: url is blacklisted in url blacklist");
if (page.word().local()) this.local_rwi_available.decrementAndGet(); else this.remote_rwi_available.decrementAndGet();
continue;
}
// content control
if (Switchboard.getSwitchboard().getConfigBool("contentcontrol.enabled", false)) {
FilterEngine f = ContentControlFilterUpdateThread.getNetworkFilter();
if (f != null && !f.isListed(page.url(), null)) {
if (log.isFine()) log.fine("dropped RWI: url is blacklisted in contentcontrol");
if (page.word().local()) this.local_rwi_available.decrementAndGet(); else this.remote_rwi_available.decrementAndGet();
continue;
}
}
final String pageurl = page.url().toNormalform(true);
final String pageauthor = page.dc_creator();
final String pagetitle = page.dc_title().toLowerCase();
// check exclusion
if (this.query.getQueryGoal().getExcludeSize() != 0 &&
((QueryParams.anymatch(pagetitle, this.query.getQueryGoal().getExcludeWords()))
|| (QueryParams.anymatch(pageurl.toLowerCase(), this.query.getQueryGoal().getExcludeWords()))
|| (QueryParams.anymatch(pageauthor.toLowerCase(), this.query.getQueryGoal().getExcludeWords())))) {
if (log.isFine()) log.fine("dropped RWI: no match with query goal exclusion");
if (page.word().local()) this.local_rwi_available.decrementAndGet(); else this.remote_rwi_available.decrementAndGet();
continue;
}
// check index-of constraint
if ((this.query.constraint != null) && (this.query.constraint.get(Tokenizer.flag_cat_indexof)) && (!(pagetitle.startsWith("index of")))) {
final Iterator<byte[]> wi = this.query.getQueryGoal().getIncludeHashes().iterator();
if (this.query.getSegment().termIndex() != null) {
while (wi.hasNext()) {
this.query.getSegment().termIndex().removeDelayed(wi.next(), page.hash());
}
}
if (log.isFine()) log.fine("dropped RWI: url does not match index-of constraint");
if (page.word().local()) this.local_rwi_available.decrementAndGet(); else this.remote_rwi_available.decrementAndGet();
continue;
}
// check location constraint
if ((this.query.constraint != null) && (this.query.constraint.get(Tokenizer.flag_cat_haslocation)) && (page.lat() == 0.0 || page.lon() == 0.0)) {
if (log.isFine()) log.fine("dropped RWI: location constraint");
if (page.word().local()) this.local_rwi_available.decrementAndGet(); else this.remote_rwi_available.decrementAndGet();
continue;
}
// check geo coordinates
double lat, lon;
if (this.query.radius > 0.0d && this.query.lat != 0.0d && this.query.lon != 0.0d && (lat = page.lat()) != 0.0d && (lon = page.lon()) != 0.0d) {
double latDelta = this.query.lat - lat;
double lonDelta = this.query.lon - lon;
double distance = Math.sqrt(latDelta * latDelta + lonDelta * lonDelta); // pythagoras
if (distance > this.query.radius) {
if (log.isFine()) log.fine("dropped RWI: radius constraint");
if (page.word().local()) this.local_rwi_available.decrementAndGet(); else this.remote_rwi_available.decrementAndGet();
continue;
}
}
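// Note on the radius check above: the distance is a plain Euclidean distance
// in degrees (no haversine, no cos(latitude) scaling of the longitude delta),
// e.g. lat/lon deltas of 3 and 4 degrees give a "distance" of 5 degrees, so
// the radius constraint is only a rough approximation that gets coarser far
// from the equator.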
// check vocabulary terms (metatags) {only available in Solr index as vocabulary_xxyyzzz_sxt field}
// TODO: vocabulary is only valid and available in local Solr index (consider to auto-switch to Searchdom.LOCAL)
if (this.query.metatags != null && !this.query.metatags.isEmpty()) {
tagloop: for (Tagging.Metatag tag : this.query.metatags) {
SolrDocument sdoc = page;
if (sdoc != null) {
Collection<Object> tagvalues = sdoc.getFieldValues(CollectionSchema.VOCABULARY_PREFIX + tag.getVocabularyName() + CollectionSchema.VOCABULARY_TERMS_SUFFIX);
if (tagvalues != null && tagvalues.contains(tag.getObject())) {
continue tagloop; // metatag exists; check next tag (filter may consist of several tags)
}
} // if we reach this point the metatag was not found (= drop entry)
if (log.isFine()) log.fine("dropped RWI: url not tagged with vocabulary " + tag.getVocabularyName());
if (page.word().local()) this.local_rwi_available.decrementAndGet(); else this.remote_rwi_available.decrementAndGet();
continue mainloop;
}
}
// from here: collect navigation information
// TODO: it may be a little bit late here to update navigator counters
// iterate over active navigator plugins (the rwi metadata may contain the field the plugin counts)
for (String s : this.navigatorPlugins.keySet()) {
Navigator navi = this.navigatorPlugins.get(s);
if (navi != null) {
navi.incDoc(page);
}
}
return page; // accept url
}
return null;
}
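/*
 * Consumption sketch (assuming the feedingIsFinished() helper defined
 * elsewhere in this class): poll until null, then decide whether the null
 * means "exhausted" or merely "nothing ranked yet".
 *
 * URIMetadataNode n;
 * while ((n = event.pullOneFilteredFromRWI(true)) != null) {
 *     process(n); // hypothetical consumer
 * }
 * boolean reallyDone = event.feedingIsFinished();
 */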
public long getURLRetrievalTime() {
return this.urlRetrievalAllTime;
}
public long getSnippetComputationTime() {
return this.snippetComputationAllTime;
}
/**
* Get topics in a ScoreMap if config allows topic navigator
* (the topics are filtered by badwords, stopwords and words included in the query)
*
* @param count max number of topics returned
* @return ScoreMap with at most count topics, or null if the topic navigator is disabled or not enough reference data is available
*/
public ScoreMap<String> getTopicNavigator(final int count) {
if (this.topicNavigatorCount > 0 && count >= 0) { //topicNavigatorCount set during init, 0=no nav
if (!this.ref.sizeSmaller(2)) {
ScoreMap<String> result;
int ic = count != 0 ? count : this.topicNavigatorCount;
if (this.ref.size() <= ic) { // size matches; return map directly
result = this.getTopics(/*ic, 500*/);
} else { // collect top most count topics
result = new ConcurrentScoreMap<String>();
Iterator<String> it = this.getTopics(/*ic, 500*/).keys(false);
while (ic-- > 0 && it.hasNext()) {
String word = it.next();
result.set(word, this.ref.get(word));
}
}
return result;
}
}
return null;
}
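/*
 * Minimal usage sketch: fetch up to 10 topics for display; null means the
 * topic navigator is disabled or there is not enough reference data yet.
 *
 * ScoreMap<String> topics = event.getTopicNavigator(10);
 * if (topics != null) {
 *     Iterator<String> it = topics.keys(false); // highest score first
 *     while (it.hasNext()) System.out.println(it.next());
 * }
 */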
/**
* Adds the retrieved results (fulltext & rwi) to the result list and
* computes the text snippets
* @return true if entries were added to the result list, otherwise false
*/
public boolean drainStacksToResult() {
// we take one entry from both stacks at the same time
boolean success = false;
final Element<URIMetadataNode> localEntryElement = this.nodeStack.sizeQueue() > 0 ? this.nodeStack.poll() : null;
final URIMetadataNode node = localEntryElement == null ? null : localEntryElement.getElement();
if (node != null) {
LinkedHashSet<String> solrsnippetlines = this.snippets.remove(ASCII.String(node.hash())); // we can remove this because it's used only once
if (solrsnippetlines != null && solrsnippetlines.size() > 0) {
OpensearchResponseWriter.removeSubsumedTitle(solrsnippetlines, node.dc_title());
final TextSnippet solrsnippet = new TextSnippet(node.hash(), OpensearchResponseWriter.getLargestSnippet(solrsnippetlines), true, ResultClass.SOURCE_CACHE, "");
final TextSnippet yacysnippet = new TextSnippet(this.loader,
node,
this.query.getQueryGoal().getIncludeHashes(),
CacheStrategy.CACHEONLY,
false,
180,
false);
final String solrsnippetline = solrsnippet.descriptionline(this.getQuery().getQueryGoal());
final String yacysnippetline = yacysnippet.descriptionline(this.getQuery().getQueryGoal());
URIMetadataNode re = node.makeResultEntry(this.query.getSegment(), this.peers, solrsnippetline.length() > yacysnippetline.length() ? solrsnippet : yacysnippet);
addResult(re, localEntryElement.getWeight());
success = true;
} else {
// we don't have a snippet from solr, so try to compute one ourselves (by reloading, if necessary)
if (SearchEvent.this.snippetFetchAlive.get() >= 10) {
// too many concurrent processes
addResult(getSnippet(node, null), localEntryElement.getWeight());
success = true;
} else {
new Thread("SearchEvent.drainStacksToResult.getSnippet") {
@Override
public void run() {
SearchEvent.this.oneFeederStarted();
try {
SearchEvent.this.snippetFetchAlive.incrementAndGet();
try {
addResult(getSnippet(node, SearchEvent.this.query.snippetCacheStrategy), localEntryElement.getWeight());
} catch (final Throwable e) {} finally {
SearchEvent.this.snippetFetchAlive.decrementAndGet();
}
} catch (final Throwable e) {} finally {
SearchEvent.this.oneFeederTerminated();
}
}
}.start();
}
}
}
if (SearchEvent.this.snippetFetchAlive.get() >= 10 || MemoryControl.shortStatus()) {
// too many concurrent processes
final URIMetadataNode noderwi = pullOneFilteredFromRWI(true);
if (noderwi != null) {
addResult(getSnippet(noderwi, null), noderwi.score());
success = true;
}
} else {
Thread t = new Thread("SearchEvent.drainStacksToResult.oneFilteredFromRWI") {
@Override
public void run() {
SearchEvent.this.oneFeederStarted();
try {
final URIMetadataNode noderwi = pullOneFilteredFromRWI(true);
if (noderwi != null) {
SearchEvent.this.snippetFetchAlive.incrementAndGet();
try {
addResult(getSnippet(noderwi, SearchEvent.this.query.snippetCacheStrategy), noderwi.score());
} catch (final Throwable e) {
ConcurrentLog.logException(e);
} finally {
SearchEvent.this.snippetFetchAlive.decrementAndGet();
}
}
} catch (final Throwable e) {} finally {
SearchEvent.this.oneFeederTerminated();
}
}
};
if (SearchEvent.this.query.snippetCacheStrategy == null) t.run(); else t.start(); //no need for concurrency if there is no latency
}
return success;
}
/**
* Place the result in the result vector and apply post-ranking;
* the post-ranking value is added to the current score.
* @param resultEntry to add
* @param score current ranking
*/
public void addResult(URIMetadataNode resultEntry, final long score) {
if (resultEntry == null) return;
final long ranking = (score * 128) + postRanking(resultEntry, this.ref /*this.getTopicNavigator(MAX_TOPWORDS)*/);
// TODO: above was originally using (see below), but getTopicNavigator returns this.ref and possibly alters this.ref on first call (this.ref.size < 2 -> this.ref.clear)
// TODO: verify and straighten the use of addTopic, getTopic and getTopicNavigator and related score calculation
// final long ranking = ((long) (score * 128.f)) + postRanking(resultEntry, this.getTopicNavigator(MAX_TOPWORDS));
resultEntry.setScore(ranking); // update the score of resultEntry for access by search interface / api
this.resultList.put(new ReverseElement<URIMetadataNode>(resultEntry, ranking)); // remove smallest in case of overflow
if (pollImmediately) this.resultList.poll(); // prevent re-ranking in case there is only a single index source which has already ranked entries.
this.addTopics(resultEntry);
}
private long postRanking(final URIMetadataNode rentry, final ScoreMap<String> topwords) {
long r = 0;
// for media search: prefer pages with many links
switch (this.query.contentdom) {
case IMAGE:
r += rentry.limage() << this.query.ranking.coeff_cathasimage;
break;
case AUDIO:
r += rentry.laudio() << this.query.ranking.coeff_cathasaudio;
break;
case VIDEO:
r += rentry.lvideo() << this.query.ranking.coeff_cathasvideo;
break;
case APP:
r += rentry.lapp() << this.query.ranking.coeff_cathasapp;
break;
default:
break;
}
// apply citation count
//System.out.println("POSTRANKING CITATION: references = " + rentry.referencesCount() + ", inbound = " + rentry.llocal() + ", outbound = " + rentry.lother());
if (this.query.getSegment().connectedCitation()) {
int referencesCount = this.query.getSegment().urlCitation().count(rentry.hash());
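// scale the citation bonus by the page's own link counts (llocal = inbound, lother = outbound),
// so that link-heavy pages receive proportionally less weight per incoming reference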
r += (128 * referencesCount / (1 + 2 * rentry.llocal() + rentry.lother())) << this.query.ranking.coeff_citation;
}
// prefer hit with 'prefer' pattern
if (this.query.prefer.matcher(rentry.url().toNormalform(true)).matches()) r += 255 << this.query.ranking.coeff_prefer;
if (this.query.prefer.matcher(rentry.title()).matches()) r += 255 << this.query.ranking.coeff_prefer;
// apply 'common-sense' heuristic using references
final String urlstring = rentry.url().toNormalform(true);
final String[] urlcomps = MultiProtocolURL.urlComps(urlstring);
final String[] descrcomps = MultiProtocolURL.splitpattern.split(rentry.title().toLowerCase());
// apply query-in-result matching
final QueryGoal.NormalizedWords urlcompmap = new QueryGoal.NormalizedWords(urlcomps);
final QueryGoal.NormalizedWords descrcompmap = new QueryGoal.NormalizedWords(descrcomps);
// the token map is used (instead of urlcomps/descrcomps) to determine appearance in url/title and eliminate duplicate occurrences
// (example: Title="News News News News News News - today is party -- News News News News News News" adds the score once instead of 12 * score!)
for (final String urlcomp : urlcompmap) {
int tc = topwords.get(urlcomp);
if (tc > 0) r += tc << this.query.ranking.coeff_urlcompintoplist;
}
for (final String descrcomp : descrcompmap) {
int tc = topwords.get(descrcomp);
if (tc > 0) r += tc << this.query.ranking.coeff_descrcompintoplist;
}
final Iterator<String> shi = this.query.getQueryGoal().getIncludeWords();
String queryword;
while (shi.hasNext()) {
queryword = shi.next();
if (urlcompmap.contains(queryword)) r += 255 << this.query.ranking.coeff_appurl;
if (descrcompmap.contains(queryword)) r += 255 << this.query.ranking.coeff_app_dc_title;
}
return r;
}
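/* Worked example (illustrative, not part of this class): the ranking coefficients
 * act as binary exponents, so each coefficient step doubles the weight of its
 * feature. With a hypothetical coeff_urlcompintoplist = 3 and a topic score
 * tc = 5, the URL-component bonus is
 *
 *     5 << 3 == 5 * 2^3 == 40
 *
 * while a 'prefer' pattern match always contributes 255 << coeff_prefer.
 */
/**
* Compute and attach a text snippet to the given result candidate.
* @param page the result candidate
* @param cacheStrategy cache/load strategy for snippet computation, or null to skip document loading
* @return the result entry with a snippet attached, or null if the snippet could
*         not be loaded and the result was therefore rejected
*/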
public URIMetadataNode getSnippet(URIMetadataNode page, final CacheStrategy cacheStrategy) {
if (page == null) return null;
if (cacheStrategy == null) {
final TextSnippet snippet = new TextSnippet(
null,
page,
this.snippetFetchWordHashes,
null,
((this.query.constraint != null) && (this.query.constraint.get(Tokenizer.flag_cat_indexof))),
SearchEvent.SNIPPET_MAX_LENGTH,
!this.query.isLocal());
return page.makeResultEntry(this.query.getSegment(), this.peers, snippet); // result without snippet
}
// load snippet
ContentDomain contentDomain = page.getContentDomain();
if (contentDomain == Classification.ContentDomain.TEXT || contentDomain == Classification.ContentDomain.ALL) {
// attach text snippet
long startTime = System.currentTimeMillis();
final TextSnippet snippet = new TextSnippet(
this.loader,
page,
this.snippetFetchWordHashes,
cacheStrategy,
((this.query.constraint != null) && (this.query.constraint.get(Tokenizer.flag_cat_indexof))),
180,
!this.query.isLocal());
SearchEvent.log.info("text snippet load time for " + page.url().toNormalform(true) + ": " + (System.currentTimeMillis() - startTime) + " ms, " + (!snippet.getErrorCode().fail() ? "snippet found" : ("no snippet found (" + snippet.getError() + ")")));
if (!snippet.getErrorCode().fail()) {
// we loaded the file and found the snippet
return page.makeResultEntry(this.query.getSegment(), this.peers, snippet); // result with snippet attached
} else if (cacheStrategy.mustBeOffline()) {
// we did not demand online loading, therefore a failure does not mean that the missing snippet causes a rejection of this result
// this may happen during a remote search, because snippet loading is omitted to retrieve results faster
return page.makeResultEntry(this.query.getSegment(), this.peers, null); // result without snippet
} else {
// problems with snippet fetch
if (this.snippetFetchWordHashes.has(Segment.catchallHash)) {
// we accept that because the word cannot be on the page
return page.makeResultEntry(this.query.getSegment(), this.peers, null);
}
final String reason = "no text snippet; errorCode = " + snippet.getErrorCode();
if (this.deleteIfSnippetFail) {
this.workTables.failURLsRegisterMissingWord(this.query.getSegment().termIndex(), page.url(), this.query.getQueryGoal().getIncludeHashes());
}
SearchEvent.log.info("sorted out url " + page.url().toNormalform(true) + " during search: " + reason);
return null;
}
}
return page.makeResultEntry(this.query.getSegment(), this.peers, null); // result without snippet
}
/**
* This is the access point for the search interface to retrieve ranked results
* for display.
*
* @param item requested result counting number (starting at 0)
* @param timeout maximum waiting time in milliseconds
* @return the ranked result at the given position, or null if none could be retrieved in time
*/
public URIMetadataNode oneResult(final int item, final long timeout) {
// check if we already retrieved this item
// (happens if a search page is accessed a second time)
final long finishTime = timeout == Long.MAX_VALUE ? Long.MAX_VALUE : System.currentTimeMillis() + timeout;
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(this.query.id(true), SearchEventType.ONERESULT, "started, item = " + item + ", available = " + this.getResultCount(), 0, 0), false);
// wait until the local solr search is finished; we must do that to be able to check if we need more
if (this.localsolrsearch != null && this.localsolrsearch.isAlive()) {
try {
this.localsolrsearch.join(100);
} catch (final InterruptedException e) {
log.warn("Wait for local solr search was interrupted.");
}
}
if (item >= this.localsolroffset && this.local_solr_stored.get() == 0 && (this.localsolrsearch != null && this.localsolrsearch.isAlive())) {
try {
this.localsolrsearch.join();
} catch (final InterruptedException e) {
log.warn("Wait for local solr search was interrupted.");
}
}
if (this.remote && item >= this.localsolroffset && this.local_solr_stored.get() >= item) {
/* Request mixing remote and local Solr results : load remaining local solr results now.
* For local only search, a new SearchEvent should be created, starting directly at the requested offset,
* thus allowing to handle last pages of large resultsets
*/
int nextitems = item - this.localsolroffset + this.query.itemsPerPage; // example: we suddenly switch to item 60 when only 10 have been shown and 20 loaded
if (this.localsolrsearch != null && this.localsolrsearch.isAlive()) {try {this.localsolrsearch.join();} catch (final InterruptedException e) {}}
if (!Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.DEBUG_SEARCH_LOCAL_SOLR_OFF, false)) {
this.localsolrsearch = RemoteSearch.solrRemoteSearch(this,
this.query.solrQuery(this.query.contentdom, false, this.excludeintext_image),
this.localsolroffset, nextitems, null /* this peer */, 0, Switchboard.urlBlacklist);
}
this.localsolroffset += nextitems;
}
// now pull results as long as needed and as long as possible
if (this.remote && item < 10 && this.resultList.sizeAvailable() <= item) {
try {
Thread.sleep(100);
} catch (final InterruptedException e) {
log.warn("Remote search results wait was interrupted.");
}
}
final int resultListIndex;
if (this.remote) {
resultListIndex = item;
} else {
resultListIndex = item - (this.localsolroffset - this.query.itemsPerPage);
}
while ( this.resultList.sizeAvailable() <= resultListIndex &&
(this.rwiQueueSize() > 0 || this.nodeStack.sizeQueue() > 0 ||
(!this.feedingIsFinished() && System.currentTimeMillis() < finishTime))) {
if (!drainStacksToResult()) {
try {
Thread.sleep(10);
} catch (final InterruptedException e) {
log.warn("Search results wait was interrupted.");
}
}
}
// check if we have a success
if (this.resultList.sizeAvailable() > resultListIndex) {
// we have the wanted result already in the result array .. return that
final URIMetadataNode re = this.resultList.element(resultListIndex).getElement();
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(this.query.id(true), SearchEventType.ONERESULT, "fetched, item = " + item + ", available = " + this.getResultCount() + ": " + re.urlstring(), 0, 0), false);
/*
if (this.localsolrsearch == null || (!this.localsolrsearch.isAlive() && this.local_solr_stored.get() > this.localsolroffset && (item + 1) % this.query.itemsPerPage == 0)) {
// at the end of a list, trigger a next solr search
if (!Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.DEBUG_SEARCH_LOCAL_SOLR_OFF, false)) {
this.localsolrsearch = RemoteSearch.solrRemoteSearch(this, this.query.solrQuery(this.query.contentdom, false, this.excludeintext_image), this.localsolroffset, this.query.itemsPerPage, null, 0, Switchboard.urlBlacklist);
}
this.localsolroffset += this.query.itemsPerPage;
}
*/
return re;
}
// no success
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(this.query.id(true), SearchEventType.ONERESULT, "not found, item = " + item + ", available = " + this.getResultCount(), 0, 0), false);
return null;
}
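/* Usage sketch (illustrative, not part of this class): a result page is
 * typically filled by polling oneResult() sequentially; the page size of 10
 * and the timeout of 3000 ms are hypothetical values.
 *
 *     for (int item = 0; item < 10; item++) {
 *         URIMetadataNode node = searchEvent.oneResult(item, 3000);
 *         if (node == null) break; // no more results within the timeout
 *         // render node.url(), node.title(), ...
 *     }
 */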
/** Image results counter */
private int imagePageCounter = 0;
private LinkedHashMap<String, ImageResult> imageViewed = new LinkedHashMap<String, ImageResult>();
private LinkedHashMap<String, ImageResult> imageSpareGood = new LinkedHashMap<String, ImageResult>();
private LinkedHashMap<String, ImageResult> imageSpareBad = new LinkedHashMap<String, ImageResult>();
private ImageResult nthImage(int item) {
Object o = SetTools.nth(this.imageViewed.values(), item);
if (o == null) return null;
return (ImageResult) o;
}
private boolean hasSpare() {
return imageSpareGood.size() > 0 || imageSpareBad.size() > 0;
}
private boolean containsSpare(String id) {
return imageSpareGood.containsKey(id) || imageSpareBad.containsKey(id);
}
private int sizeSpare() {
return imageSpareGood.size() + imageSpareBad.size();
}
private ImageResult nextSpare() {
if (imageSpareGood.size() > 0) {
Map.Entry<String, ImageResult> next = imageSpareGood.entrySet().iterator().next();
imageViewed.put(next.getKey(), next.getValue());
imageSpareGood.remove(next.getKey());
return next.getValue();
}
if (imageSpareBad.size() > 0) {
Map.Entry<String, ImageResult> next = imageSpareBad.entrySet().iterator().next();
imageViewed.put(next.getKey(), next.getValue());
imageSpareBad.remove(next.getKey());
return next.getValue();
}
return null;
}
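/**
* Retrieve one image result. Image candidates harvested from a document are
* parked in the spare maps (good = matching the query goal, bad = not matching)
* and handed out from there before the next document is pulled.
* @param item requested image result counting number (starting at 0)
* @param timeout maximum waiting time in milliseconds for the underlying document retrieval
* @return the requested image result
* @throws MalformedURLException if no image url could be found
*/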
public ImageResult oneImageResult(final int item, final long timeout) throws MalformedURLException {
if (item < imageViewed.size()) return nthImage(item);
if (imageSpareGood.size() > 0) return nextSpare(); // first put out all good spare, but no bad spare
URIMetadataNode doc = oneResult(imagePageCounter++, timeout); // we must use a different counter here because the image counter can be higher when one page fills several spare slots
// check if the match was made in the url or in the image links
if (doc == null) {
if (hasSpare()) return nextSpare();
throw new MalformedURLException("no image url found");
}
// try to get more
// there can be two different kinds of image hits: either the document itself is an image or images are embedded in the links of text documents.
// boolean fakeImageHost = ms.url().getHost() != null && ms.url().getHost().indexOf("wikipedia") > 0; // pages with image extension from wikipedia do not contain image files but html files... I know this is a bad hack, but many results come from wikipedia and we must handle that
// generalize the above hack (regarding URLs with an image file extension but being HTML, with an HTML MIME type)
if (doc.doctype() == Response.DT_IMAGE) {
/* Icons are not always .ico files and should now be indexed in icons_urlstub_sxt. But this test still makes sense for older indexed documents,
* or documents coming from previous versions peers */
if (!doc.url().getFileName().endsWith(".ico")) { // we don't want favicons
final String id = ASCII.String(doc.hash());
// check image size
final Collection<Object> height = doc.getFieldValues(CollectionSchema.images_height_val.getSolrFieldName());
final Collection<Object> width = doc.getFieldValues(CollectionSchema.images_width_val.getSolrFieldName());
int h = height == null ? 0 : (Integer) height.iterator().next(); // might be -1 for unknown
int w = width == null ? 0 : (Integer) width.iterator().next();
if ((h <= 0 || h > 16) && (w <= 0 || w > 16)) { // we don't want too small images (< 16x16)
if (!imageViewed.containsKey(id) && !containsSpare(id)) imageSpareGood.put(id, new ImageResult(doc.url(), doc.url(), doc.mime(), doc.title(), w, h, 0));
}
}
} else {
Collection<Object> altO = doc.getFieldValues(CollectionSchema.images_alt_sxt.getSolrFieldName());
Collection<Object> imgO = doc.getFieldValues(CollectionSchema.images_urlstub_sxt.getSolrFieldName());
if (imgO != null && imgO.size() > 0 && imgO instanceof List<?>) {
List<Object> alt = altO == null ? null : (List<Object>) altO;
List<Object> img = (List<Object>) imgO;
List<String> prt = CollectionConfiguration.indexedList2protocolList(doc.getFieldValues(CollectionSchema.images_protocol_sxt.getSolrFieldName()), img.size());
Collection<Object> heightO = doc.getFieldValues(CollectionSchema.images_height_val.getSolrFieldName());
Collection<Object> widthO = doc.getFieldValues(CollectionSchema.images_width_val.getSolrFieldName());
List<Object> height = heightO == null ? null : (List<Object>) heightO;
List<Object> width = widthO == null ? null : (List<Object>) widthO;
for (int c = 0; c < img.size(); c++) {
String image_urlstub = (String) img.get(c);
/* Icons are not always .ico files and should now be indexed in icons_urlstub_sxt. But this test still makes sense for older indexed documents,
* or documents coming from previous versions peers */
if (image_urlstub.endsWith(".ico")) continue; // we don't want favicons, makes the result look idiotic
try {
int h = height == null ? 0 : (Integer) height.get(c);
int w = width == null ? 0 : (Integer) width.get(c);
// check that the size is good for display (the parser may init unknown dimensions with -1)
if (h > 0 && h <= 16) continue; // too small for display
if (w > 0 && w <= 16) continue; // too small for display
DigestURL imageUrl = new DigestURL((prt != null && prt.size() > c ? prt.get(c) : "http") + "://" + image_urlstub);
String id = ASCII.String(imageUrl.hash());
if (!imageViewed.containsKey(id) && !containsSpare(id)) {
String image_alt = (alt != null && alt.size() > c) ? (String) alt.get(c) : "";
ImageResult imageResult = new ImageResult(doc.url(), imageUrl, "", image_alt, w, h, 0);
boolean match = (query.getQueryGoal().matches(image_urlstub) || query.getQueryGoal().matches(image_alt));
if (match) imageSpareGood.put(id, imageResult); else imageSpareBad.put(id, imageResult);
}
} catch (MalformedURLException e) {
continue;
}
}
}
}
if (hasSpare()) return nextSpare();
throw new MalformedURLException("no image url found");
}
public class ImageResult {
public DigestURL imageUrl, sourceUrl;
public String mimetype = "", imagetext = "";
public int width = 0, height = 0, fileSize = 0;
public ImageResult(DigestURL sourceUrl, DigestURL imageUrl, String mimetype, String imagetext, int width, int height, int fileSize) {
this.sourceUrl = sourceUrl;
this.imageUrl = imageUrl;
this.mimetype = mimetype;
this.imagetext = imagetext.isEmpty() ? imageUrl.getFileName() : imagetext;
this.width = width;
this.height = height;
this.fileSize = fileSize;
}
@Override
public String toString() {
return this.imageUrl.toNormalform(false);
}
}
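/**
* Wait until enough results are available (or the waiting time is exhausted)
* and return them in ranked order.
* @param waitingtime maximum waiting time in milliseconds, Long.MAX_VALUE for no limit
* @return the ranked result list with at most query.neededResults() entries
*/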
public ArrayList<WeakPriorityBlockingQueue.Element<URIMetadataNode>> completeResults(final long waitingtime) {
final long timeout = waitingtime == Long.MAX_VALUE ? Long.MAX_VALUE : System.currentTimeMillis() + waitingtime;
int i = 0;
while (this.resultList.sizeAvailable() < this.query.neededResults() && System.currentTimeMillis() < timeout) {
URIMetadataNode re = oneResult(i++, timeout - System.currentTimeMillis());
if (re == null) break;
}
return this.resultList.list(Math.min(this.query.neededResults(), this.resultList.sizeAvailable()));
}
/**
* delete a specific entry from the search results
* this is used if the user clicks on a '-' sign beside the search result
* @param urlhash
* @return true if an entry was deleted, false otherwise
*/
protected boolean delete(final String urlhash) {
final Iterator<Element<URIMetadataNode>> i = this.resultList.iterator();
Element<URIMetadataNode> entry;
while (i.hasNext()) {
entry = i.next();
if (urlhash.equals(ASCII.String(entry.getElement().url().hash()))) {
i.remove();
return true;
}
}
return false;
}
public ReferenceOrder getOrder() {
return this.order;
}
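/**
* Check whether result feeding has finished: no feeder thread is alive anymore
* and the minimum number of feeders has terminated (at least two for remote
* searches, at least one for local-only searches).
*/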
protected boolean feedingIsFinished() {
return
this.feedersTerminated.intValue() > (this.remote ? 1 : 0) &&
this.feedersAlive.get() == 0;
}
/**
* method to signal the incoming stack that one feeder has terminated
*/
public void oneFeederTerminated() {
this.feedersTerminated.incrementAndGet();
final int c = this.feedersAlive.decrementAndGet();
assert c >= 0 : "feeders = " + c;
}
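/**
* method to signal the incoming stack that one feeder was started
*/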
public void oneFeederStarted() {
this.feedersAlive.incrementAndGet();
}
public QueryParams getQuery() {
return this.query;
}
public int[] flagCount() {
return this.flagcount;
}
protected void addBegin() {
this.addRunning = true;
}
public void addFinalize() {
this.addRunning = false;
}
protected boolean addRunning() {
return this.addRunning;
}
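/**
* Check whether the rwi (reverse word index) stack and all double-domain
* caches are empty.
*/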
public boolean rwiIsEmpty() {
if ( !this.rwiStack.isEmpty() ) {
return false;
}
for ( final WeakPriorityBlockingQueue<WordReferenceVars> s : this.doubleDomCache.values() ) {
if ( !s.isEmpty() ) {
return false;
}
}
return true;
}
protected int rwiQueueSize() {
int c = this.rwiStack.sizeQueue();
for ( final WeakPriorityBlockingQueue<WordReferenceVars> s : this.doubleDomCache.values() ) {
c += s.sizeQueue();
}
return c;
}
protected boolean testFlags(final Bitfield flags) {
if (this.query.constraint == null) return true;
// test if the entry matches the filter
// if allofconstraint = true: let only entries pass that have all matching bits
// if allofconstraint = false: let all entries pass that have at least one matching bit
if (this.query.allofconstraint) {
for ( int i = 0; i < 32; i++ ) {
if ((this.query.constraint.get(i)) && (!flags.get(i))) return false;
}
return true;
}
for (int i = 0; i < 32; i++) {
if ((this.query.constraint.get(i)) && (flags.get(i))) return true;
}
return false;
}
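/* Illustrative sketch (not part of this class): with a hypothetical constraint
 * that sets only bit 0 and allofconstraint = true, an entry passes only if its
 * flags also contain bit 0:
 *
 *     constraint = 0001 (binary, bit 0 set)
 *     flags      = 1001 (bits 0 and 3 set)
 *     -> every constraint bit is present in flags, so testFlags returns true
 */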
protected Map<byte[], ReferenceContainer<WordReference>> searchContainerMap() {
// direct access to the result maps is needed for abstract generation
// this is only available if execQuery() was called before
return this.localSearchInclusion;
}
/**
* Return the list of words that has been computed by statistics over all
* words that appeared in the url or the description of all urls
*
* @return ScoreMap of topic words and their scores
*/
public ScoreMap<String> getTopics(/* final int maxcount, final long maxtime */) {
/* ---------------------------------- start of rem (2016-09-03)
// TODO: result map is not used currently, verify if it should and use or delete this code block
// TODO: as it is not used now - in favour of performance this code block is rem'ed (2016-09-03)
final ScoreMap<String> result = new ConcurrentScoreMap<String>();
if ( this.ref.sizeSmaller(2) ) {
this.ref.clear(); // navigators with one entry are not useful
}
final Map<String, Float> counts = new HashMap<String, Float>();
final Iterator<String> i = this.ref.keys(false);
String word;
int c;
float q, min = Float.MAX_VALUE, max = Float.MIN_VALUE;
int ic = maxcount;
long timeout = maxtime == Long.MAX_VALUE ? Long.MAX_VALUE : System.currentTimeMillis() + maxtime;
while ( ic-- > 0 && i.hasNext() && System.currentTimeMillis() < timeout) {
word = i.next();
if ( word == null ) {
continue;
}
c = this.query.getSegment().getWordCountGuess(word);
if ( c > 0 ) {
q = ((float) this.ref.get(word)) / ((float) c);
min = Math.min(min, q);
max = Math.max(max, q);
counts.put(word, q);
}
}
if ( max > min ) {
for ( final Map.Entry<String, Float> ce : counts.entrySet() ) {
result.set(ce.getKey(), (int) (((double) maxcount) * (ce.getValue() - min) / (max - min)));
}
}
/* ------------------------------------ end of rem (2016-09-03) */
return this.ref;
}
private final static Pattern lettermatch = Pattern.compile("[a-z]+");
/**
* Collects topics in a ScoreMap for words not included in the query words.
* Words are also filtered by the badword blacklist and the stopword list.
* @param words words to be added as topic candidates
*/
public void addTopic(final String[] words) {
String word;
for ( final String w : words ) {
word = w.toLowerCase();
if ( word.length() > 2
&& "http_html_php_ftp_www_com_org_net_gov_edu_index_home_page_for_usage_the_and_zum_der_die_das_und_the_zur_bzw_mit_blog_wiki_aus_bei_off"
.indexOf(word) < 0
&& !this.query.getQueryGoal().containsInclude(word)
&& lettermatch.matcher(word).matches()
&& !Switchboard.badwords.contains(word)
&& !Switchboard.stopwords.contains(word) ) {
this.ref.inc(word);
}
}
}
/**
* Add title words to this SearchEvent's topic score map
* @param resultEntry
*/
protected void addTopics(final URIMetadataNode resultEntry) {
// take out relevant information for reference computation
if ((resultEntry.url() == null) || (resultEntry.title() == null)) return;
final String[] descrcomps = MultiProtocolURL.splitpattern.split(resultEntry.title()); // words in the description
// add references
addTopic(descrcomps);
}
}