/**
* FederateSearchManager.java
* Copyright 2015 by Burkhard Buelte
* First released 19.01.2015 at http://yacy.net
*
* This library is free software; you can redistribute it and/or modify it under
* the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 2.1 of the License, or (at your option)
* any later version.
*
* This library is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
* details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt If not, see
* <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.federate;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import net.yacy.cora.document.analysis.Classification;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.opensearch.OpenSearchConnector;
import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.storage.Configuration;
import net.yacy.cora.storage.Configuration.Entry;
import net.yacy.cora.storage.Files;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.robots.RobotsTxtEntry;
import net.yacy.document.parser.xml.opensearchdescriptionReader;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.util.Bitfield;
import net.yacy.kelondro.util.MemoryControl;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
import net.yacy.search.query.QueryGoal;
import net.yacy.search.query.QueryModifier;
import net.yacy.search.query.QueryParams;
import net.yacy.search.query.SearchEvent;
import net.yacy.search.schema.WebgraphSchema;
/**
* Handling of queries to configured remote OpenSearch systems.
*/
public class FederateSearchManager {
/** Minimum delay between two requests to the same remote system (in ms) */
private final int accessDelay = 15000;
/** Settings file listing the remote targets; stays null until the constructor (with a non-null switchboard) or init(String) assigns it */
private File confFile = null; // later initialized to DATA/SETTINGS/heuristicopensearch.conf
/** Connectors created from the enabled entries of the settings file */
private HashSet<AbstractFederateSearchConnector> conlist;
/** Content of the settings file as loaded by the constructor or init(String) */
protected Configuration cfg;
/** Switchboard instance; stays null when the constructor was called with a null switchboard */
private Switchboard switchboard;
/** Self reference for static .getManager(); also assigned at the end of the constructor */
private static FederateSearchManager manager = null;
/**
 * Creates the manager and loads one connector for every enabled entry of
 * DATA/SETTINGS/heuristicopensearch.conf. If that file does not exist yet,
 * the defaults shipped in the application directory are copied first.
 * A failure to copy the defaults is logged and leaves the connector list
 * empty; a malformed "cfgfile:" entry is logged and skipped.
 *
 * When the switchboard is null only the (empty) connector list is created.
 * On completion with a non-null switchboard the new instance is registered
 * as the one returned by {@link #getManager()}.
 *
 * @param sb switchboard instance. Must not be null.
 */
public FederateSearchManager(Switchboard sb) {
    super();
    this.conlist = new HashSet<AbstractFederateSearchConnector>();
    // from here we need Switchboard settings
    if (sb == null) {
        return;
    }
    this.switchboard = sb;
    // Data needed active name, url(template), desc, rule-when-to-use, specifics
    confFile = new File(sb.getDataPath(), "DATA/SETTINGS/heuristicopensearch.conf");
    if (!confFile.exists()) {
        try {
            Files.copy(new File(sb.appPath, "defaults/heuristicopensearch.conf"), confFile);
            File defdir = new File(sb.dataPath, "DATA/SETTINGS/federatecfg");
            if (!defdir.exists()) {
                Files.copy(new File(sb.appPath, "defaults/federatecfg"), defdir);
            }
        } catch (IOException ex) {
            // do not hide the reason why no default configuration is available
            ConcurrentLog.logException(ex);
        }
    }
    // read settings config file
    if (confFile.exists()) {
        try {
            cfg = new Configuration(confFile);
            Iterator<Entry> it = cfg.entryIterator();
            while (it.hasNext()) {
                Entry cfgentry = it.next();
                String url = cfgentry.getValue();
                if (cfgentry.enabled() && url != null && !url.isEmpty()) {
                    String name = cfgentry.key();
                    if (url.startsWith("cfgfile:")) { // is cfgfile with field mappings (no opensearch url)
                        // format prefix:connectortype:configfilename
                        // example cfgfile:solrconnector:testsys.solr.schema
                        String[] parts = url.split(":");
                        // all three parts are required; an incomplete entry such as
                        // "cfgfile:" or "cfgfile:solrconnector" is reported, not dereferenced
                        if (parts.length >= 3 && parts[1].equalsIgnoreCase("solrconnector")) {
                            SolrFederateSearchConnector sfc = new SolrFederateSearchConnector();
                            if (sfc.init(name, sb.getDataPath() + "/DATA/SETTINGS/federatecfg/" + parts[2])) {
                                conlist.add(sfc);
                            }
                        } else {
                            ConcurrentLog.config("FederateSearchManager", "Error in configuration of: " + url);
                        }
                    } else { // handle opensearch url template
                        OpenSearchConnector osc = new OpenSearchConnector(url);
                        if (osc.init(name, sb.getDataPath() + "/DATA/SETTINGS/federatecfg/" + OpenSearchConnector.htmlMappingFileName(name))) {
                            conlist.add(osc);
                        }
                    }
                }
            }
        } catch (IOException ex) {
            ConcurrentLog.logException(ex);
        }
    }
    manager = this; // reference for static access via .getManager()
}
/**
 * Returns the shared manager instance, creating it from the current
 * switchboard on first use. There should be only one instance running,
 * so use this accessor to get or initialize the manager.
 *
 * @return the shared manager instance
 */
public static FederateSearchManager getManager() {
    if (manager != null) {
        return manager;
    }
    // first access: build the instance from the running switchboard
    manager = new FederateSearchManager(Switchboard.getSwitchboard());
    return manager;
}
/**
 * Forwards a search event to every remote system selected by
 * {@link #getBest(QueryParams)}.
 * Nothing is done for a null event, for a local query, or while
 * memory is reported as short.
 *
 * @param theSearch the search event to hand to the connectors, may be null
 */
public void search(SearchEvent theSearch) {
    if (theSearch == null) {
        return;
    }
    if (theSearch.query.isLocal() || MemoryControl.shortStatus()) {
        return;
    }
    final Iterator<AbstractFederateSearchConnector> targets = getBest(theSearch.getQuery()).iterator();
    while (targets.hasNext()) {
        targets.next().search(theSearch);
    }
}
/**
 * Runs a query against the remote systems selected by
 * {@link #getBest(QueryParams)} and concatenates their results.
 *
 * @param query the query to send, must not be null
 * @return the combined results of all asked systems, or null if the
 *         query is local or memory is reported as short
 */
public List<URIMetadataNode> query(QueryParams query) {
    if (query.isLocal() || MemoryControl.shortStatus()) {
        return null;
    }
    final List<URIMetadataNode> collected = new ArrayList<URIMetadataNode>();
    for (final AbstractFederateSearchConnector connector : getBest(query)) {
        collected.addAll(connector.query(query));
    }
    return collected;
}
/**
 * Wraps a plain search string into query parameters and delegates to
 * {@link #query(QueryParams)}.
 *
 * NOTE(review): the parameters are built with Searchdom.LOCAL while
 * query(QueryParams) returns null whenever isLocal() is true - confirm
 * that this combination is intended.
 *
 * @param querystr the search string
 * @return whatever query(QueryParams) returns for the generated parameters
 */
public List<URIMetadataNode> query(String querystr) {
    final QueryGoal goal = new QueryGoal(querystr);
    final Bitfield constraint = new Bitfield();
    final Switchboard sb = this.switchboard;
    final QueryParams params = new QueryParams(
            goal,
            new QueryModifier(0),
            Integer.MAX_VALUE,
            "",
            Classification.ContentDomain.ALL,
            "", // lang
            0, // timezoneOffset
            null,
            CacheStrategy.IFFRESH,
            100, // count
            0, // offset
            ".*", // urlmask
            null,
            null,
            QueryParams.Searchdom.LOCAL,
            constraint,
            false,
            null,
            MultiProtocolURL.TLD_any_zone_filter,
            "",
            false,
            sb.index,
            sb.getRanking(),
            "", // userAgent
            0.0d, 0.0d, 0.0d,
            new String[0]);
    return query(params);
}
/**
 * Adds a search target system/connector to the config file and, if the
 * target is active, creates its connector in memory.
 *
 * @param name name of the target, used as key in the config file
 * @param urlTemplate query template url
 * @param active whether the entry is enabled; only enabled entries get a connector
 * @param comment comment stored with the config entry
 * @return true if the entry was written to the config file; false if no
 *         config file is known, the name is null or empty, or the config
 *         file could not be read or written
 */
public boolean addOpenSearchTarget(String name, String urlTemplate, boolean active, String comment) {
    if (confFile == null || name == null || name.isEmpty()) {
        return false;
    }
    try {
        final Configuration conf = new Configuration(confFile);
        conf.add(name, null, active);
        final Configuration.Entry e = conf.get(name);
        e.setValue(urlTemplate);
        e.setEnable(active);
        e.setComment(comment);
        conf.put(name, e);
        try {
            conf.commit();
            if (active) {
                OpenSearchConnector osd = new OpenSearchConnector(urlTemplate);
                String htmlMappingFile = this.switchboard.getDataPath() + "/DATA/SETTINGS/federatecfg/" + OpenSearchConnector.htmlMappingFileName(name);
                if (osd.init(name, htmlMappingFile)) {
                    conlist.add(osd);
                }
            }
        } catch (final IOException ex) {
            ConcurrentLog.warn("FederateSearchManager", "config file write error");
            // keep the cause in the log and do not report success for an entry that was not stored
            ConcurrentLog.logException(ex);
            return false;
        }
        return true;
    } catch (final IOException e1) {
        ConcurrentLog.logException(e1);
        return false;
    }
}
/**
 * Reports how many remote query target connectors are currently held
 * in the connector list.
 *
 * @return number of connectors in the list
 */
public int getSize() {
    final int connectorCount = this.conlist.size();
    return connectorCount;
}
/**
 * Selects the configured connectors that may be asked right now.
 * A connector is selected when its base URL is well-formed, the fixed
 * access delay and any robots.txt crawl delay have elapsed since its
 * last access, and robots.txt does not disallow its URL.
 *
 * @param query the query to be sent (currently not used for the selection)
 * @return set of connectors that may be queried
 */
protected Set<AbstractFederateSearchConnector> getBest(final QueryParams query) {
    final HashSet<AbstractFederateSearchConnector> eligible = new HashSet<AbstractFederateSearchConnector>();
    for (final AbstractFederateSearchConnector fsc : conlist) {
        MultiProtocolURL connectorURL;
        try {
            connectorURL = new MultiProtocolURL(fsc.baseurl);
        } catch (final MalformedURLException e) {
            ConcurrentLog.warn("FederateSearchManager", "Malformed connector URL : " + fsc.baseurl);
            continue;
        }

        // look up the robots.txt entry of the target, if a robots service is available
        final RobotsTxtEntry robotsEntry;
        if (this.switchboard != null && this.switchboard.robots != null) {
            robotsEntry = this.switchboard.robots.getEntry(connectorURL,
                    ClientIdentification.yacyInternetCrawlerAgent);
        } else {
            robotsEntry = null;
        }
        final int robotsDelay = (robotsEntry == null) ? 0 : robotsEntry.getCrawlDelayMillis();

        // enforce 15 sec delay between searches to same system, and also
        // respect any robots.txt Crawl-delay directive
        final long now = System.currentTimeMillis();
        final boolean accessDelayElapsed = fsc.lastaccesstime + accessDelay < now;
        final boolean robotsDelayElapsed = fsc.lastaccesstime + robotsDelay < now;
        if (!(accessDelayElapsed && robotsDelayElapsed)) {
            continue;
        }

        // finally check the robots.txt exclusion
        if (robotsEntry != null && robotsEntry.isDisallowed(connectorURL)) {
            ConcurrentLog.warn("FederateSearchManager",
                    "Connector URL is disallowed by robots.txt : " + fsc.baseurl);
        } else {
            eligible.add(fsc);
        }
    }
    return eligible;
}
/**
 * Discover opensearch description links from the local (embedded) Solr
 * webgraph index and add found systems to the config file.
 * The index is searched for webgraph entries whose target_rel_s field
 * matches "search"; the link target of each hit is read as an OpenSearch
 * description. Systems offering an RSS or Atom template that is not
 * disallowed by robots.txt are added to the config file as inactive
 * entries. The checking runs in a background thread that fetches the hits
 * in chunks of 20 and stops fetching further chunks after one hour.
 *
 * @param sb switchboard giving access to the index, config and robots.txt service
 * @return true if the background discover job was started or the initial
 *         query found nothing to check; false if sb is null, the webgraph
 *         index or the required schema fields are not enabled, or the
 *         initial query failed
 */
public boolean discoverFromSolrIndex(final Switchboard sb) {
if (sb == null) {
return false;
}
// check if needed Solr fields are available (selected)
if (!sb.index.fulltext().useWebgraph()) {
ConcurrentLog.severe("FederateSearchManager", "Error on connecting to embedded Solr webgraph index");
return false;
}
final SolrConnector connector = sb.index.fulltext().getWebgraphConnector();
// all three schema fields and the webgraph core service switch are required
final boolean metafieldavailable = sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_rel_s.name())
&& (sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_protocol_s.name()) && sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_urlstub_s.name()))
&& sb.getConfigBool(SwitchboardConstants.CORE_SERVICE_WEBGRAPH, false);
if (!metafieldavailable) {
ConcurrentLog.warn("FederateSearchManager", "webgraph option and webgraph Schema fields target_rel_s, target_protocol_s and target_urlstub_s must be switched on");
return false;
}
// the solr search
final String webgraphquerystr = WebgraphSchema.target_rel_s.getSolrFieldName() + ":search";
final String[] webgraphqueryfields = {WebgraphSchema.target_protocol_s.getSolrFieldName(), WebgraphSchema.target_urlstub_s.getSolrFieldName()};
// alternatively target_protocol_s + "://" +target_host_s + target_path_s
final long numfound;
try {
// probe with a single-row request, only the total hit count is needed here
SolrDocumentList docList = connector.getDocumentListByQuery(webgraphquerystr, null, 0, 1, webgraphqueryfields);
numfound = docList.getNumFound();
if (numfound == 0) {
ConcurrentLog.info("FederateSearchManager", "no results found, abort discover job");
// note: reported as true although no background job is started
return true;
}
ConcurrentLog.info("FederateSearchManager", "start checking " + Long.toString(numfound) + " found index results");
} catch (final IOException ex) {
ConcurrentLog.logException(ex);
return false;
}
final long stoptime = System.currentTimeMillis() + 1000 * 3600; // make sure job doesn't run forever
// job to iterate through Solr index to find links to opensearchdescriptions
// started as background job as connect timeouts may cause it run a long time
final Thread job = new Thread(FederateSearchManager.class.getSimpleName() + ".discoverFromSolrIndex") {
@Override
public void run() {
try {
boolean doloop = true;
int loopnr = 0;
Set<String> dblmem = new HashSet<String>(); // temp memory for already checked url
while (doloop) {
ConcurrentLog.info("FederateSearchManager", "start Solr query loop at " + Integer.toString(loopnr * 20) + " of " + Long.toString(numfound));
SolrDocumentList docList = connector.getDocumentListByQuery(webgraphquerystr, null, loopnr * 20, 20, webgraphqueryfields); // check chunk of 20 result documents
loopnr++;
// the time limit only prevents further chunks, the chunk just fetched is still processed below
if (stoptime < System.currentTimeMillis()) {// stop after max 1h
doloop = false;
ConcurrentLog.info("FederateSearchManager", "long running discover task aborted");
}
if (docList != null && docList.size() > 0) {
Iterator<SolrDocument> docidx = docList.iterator();
while (docidx.hasNext()) {
SolrDocument sdoc = docidx.next();
// rebuild the link target from its protocol and urlstub fields
String hrefurltxt = sdoc.getFieldValue(WebgraphSchema.target_protocol_s.getSolrFieldName()) + "://" + sdoc.getFieldValue(WebgraphSchema.target_urlstub_s.getSolrFieldName());
URL url;
try {
url = new URL(hrefurltxt);
} catch (final MalformedURLException ex) {
ConcurrentLog.warn("FederateSearchManager", "OpenSearch description URL is malformed : " + hrefurltxt);
continue;
}
//TODO: check Blacklist
// add() returns false for an authority seen before, so each authority is checked only once
if (dblmem.add(url.getAuthority())) { // use only main path to detect double entries
opensearchdescriptionReader os = new opensearchdescriptionReader(hrefurltxt);
if (os.getRSSorAtomUrl() != null) {
/* Check eventual robots.txt policy */
RobotsTxtEntry robotsEntry = null;
MultiProtocolURL templateURL;
try {
templateURL = new MultiProtocolURL(os.getRSSorAtomUrl());
} catch (final MalformedURLException ex) {
// the malformed value is the template URL, the message logs the description URL it came from
ConcurrentLog.warn("FederateSearchManager", "OpenSearch description URL is malformed : " + hrefurltxt);
continue;
}
if(sb.robots != null) {
robotsEntry = sb.robots.getEntry(templateURL, ClientIdentification.yacyInternetCrawlerAgent);
}
if(robotsEntry != null && robotsEntry.isDisallowed(templateURL)) {
ConcurrentLog.info("FederateSearchManager", "OpenSearch description template URL is disallowed by robots.xt");
} else {
// add found system to config file
// (passed as inactive; the return value of addOpenSearchTarget is not checked here)
addOpenSearchTarget(os.getShortName(), os.getRSSorAtomUrl(), false, os.getItem("LongName"));
ConcurrentLog.info("FederateSearchManager", "added " + os.getShortName() + " " + hrefurltxt);
}
} else {
ConcurrentLog.info("FederateSearchManager", "osd.xml check failed (no RSS or Atom support) for " + hrefurltxt);
}
}
}
} else {
// an empty or missing chunk means all results have been visited
doloop = false;
}
}
ConcurrentLog.info("FederateSearchManager", "finisched Solr query (checked " + Integer.toString(dblmem.size()) + " unique opensearchdescription links found in " + Long.toString(numfound) + " results)");
} catch (final IOException ex) {
ConcurrentLog.logException(ex);
}
}
};
job.start();
return true;
}
/**
 * Read or reread the opensearch config file and initialize the connectors
 * of all enabled entries. Connectors held so far are discarded once the
 * config file has been read. Connector specific files are looked up in
 * the "federatecfg" directory located next to the config file.
 *
 * @param cfgFileName path of the config file
 * @return true if the config file was read; false if it does not exist
 *         or could not be read
 */
public boolean init(String cfgFileName) {
    confFile = new File(cfgFileName);
    if (!confFile.exists()) {
        return false;
    }
    try {
        cfg = new Configuration(confFile);
        this.conlist.clear(); // prevent double entries
        // getParentFile() is null for a bare file name; File(parent, child) then
        // resolves "federatecfg" relative to the working directory instead of "null/..."
        final File connectorCfgDir = new File(confFile.getParentFile(), "federatecfg");
        Iterator<Entry> it = cfg.entryIterator();
        while (it.hasNext()) {
            Entry cfgentry = it.next();
            if (!cfgentry.enabled()) { // hold only enabled in memory
                continue;
            }
            String name = cfgentry.key();
            String url = cfgentry.getValue();
            if (url == null || url.isEmpty()) {
                continue;
            }
            if (url.startsWith("cfgfile:")) { // is cfgfile with field mappings (no opensearch url)
                // config entry has 3 parts separated by : 1=cfgfile 2=connectortype 3=relative path to connector-cfg-file
                // example cfgfile:solrconnector:testsys.solr.schema
                String[] parts = url.split(":");
                // an incomplete entry is reported instead of failing on a missing part
                if (parts.length >= 3 && parts[1].equalsIgnoreCase("solrconnector")) {
                    SolrFederateSearchConnector sfc = new SolrFederateSearchConnector();
                    if (sfc.init(name, new File(connectorCfgDir, parts[2]).getPath())) {
                        conlist.add(sfc);
                    }
                } else {
                    ConcurrentLog.config("FederateSearchManager", "Init error in configuration of: " + url);
                }
            } else { // handle opensearch url template
                OpenSearchConnector osd = new OpenSearchConnector(url);
                if (osd.init(name, new File(connectorCfgDir, OpenSearchConnector.htmlMappingFileName(name)).getPath())) {
                    conlist.add(osd);
                }
            }
        }
        return true;
    } catch (IOException ex) {
        ConcurrentLog.logException(ex);
        return false;
    }
}
}