// CrawlSwitchboard.java
// (C) 2005, 2006 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 2005 on http://www.anomic.de
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

package net.yacy.crawler;

import java.io.File;
import java.io.IOException;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.ConcurrentHashMap;

import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.order.NaturalOrder;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.CrawlQueues;
import net.yacy.crawler.data.NoticedURL.StackType;
import net.yacy.crawler.retrieval.Request;
import net.yacy.kelondro.blob.MapHeap;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.index.RowHandleSet;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.kelondroException;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;

public final class CrawlSwitchboard {

    public static final String CRAWL_PROFILE_AUTOCRAWL_DEEP = "autocrawlDeep";
    public static final String CRAWL_PROFILE_AUTOCRAWL_SHALLOW = "autocrawlShallow";
    public static final String CRAWL_PROFILE_PROXY = "proxy";
    public static final String CRAWL_PROFILE_REMOTE = "remote";
    public static final String CRAWL_PROFILE_SNIPPET_LOCAL_TEXT = "snippetLocalText";
    public static final String CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT = "snippetGlobalText";
    public static final String CRAWL_PROFILE_GREEDY_LEARNING_TEXT = "snippetGreedyLearningText";
    public static final String CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA = "snippetLocalMedia";
    public static final String CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA = "snippetGlobalMedia";
    public static final String CRAWL_PROFILE_SURROGATE = "surrogates";
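
    /**
     * Names of the built-in profiles created by initActiveCrawlProfiles(). Profiles
     * listed here are never treated as deletion candidates by clear(),
     * getActiveProfiles() or getFinishedProfiles().
     */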
    public static Set<String> DEFAULT_PROFILES = new HashSet<String>();
    static {
        DEFAULT_PROFILES.add(CRAWL_PROFILE_AUTOCRAWL_DEEP);
        DEFAULT_PROFILES.add(CRAWL_PROFILE_AUTOCRAWL_SHALLOW);
        DEFAULT_PROFILES.add(CRAWL_PROFILE_PROXY);
        DEFAULT_PROFILES.add(CRAWL_PROFILE_REMOTE);
        DEFAULT_PROFILES.add(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT);
        DEFAULT_PROFILES.add(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT);
        DEFAULT_PROFILES.add(CRAWL_PROFILE_GREEDY_LEARNING_TEXT);
        DEFAULT_PROFILES.add(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA);
        DEFAULT_PROFILES.add(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA);
        DEFAULT_PROFILES.add(CRAWL_PROFILE_SURROGATE);
    }

    public static final String DBFILE_ACTIVE_CRAWL_PROFILES = "crawlProfilesActive1.heap";
    public static final String DBFILE_PASSIVE_CRAWL_PROFILES = "crawlProfilesPassive1.heap";

    // recrawl cycles in minutes (60 * 24 = one day)
    public static final long CRAWL_PROFILE_PROXY_RECRAWL_CYCLE = 60L * 24L;
    public static final long CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE = 60L * 24L * 30L;
    public static final long CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE = 60L * 24L * 30L;
    public static final long CRAWL_PROFILE_GREEDY_LEARNING_TEXT_RECRAWL_CYCLE = 60L * 24L * 30L;
    public static final long CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE = 60L * 24L * 30L;
    public static final long CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE = 60L * 24L * 30L;
    public static final long CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE = 60L * 24L * 30L;

    private final ConcurrentLog log;
    private MapHeap profilesActiveCrawls;
    private final MapHeap profilesPassiveCrawls;
    private final Map<byte[], CrawlProfile> profilesActiveCrawlsCache; //TreeMap<byte[], DigestURI>(Base64Order.enhancedCoder);
    private final Map<String, RowHandleSet> profilesActiveCrawlsCounter;

    public CrawlProfile defaultProxyProfile, defaultRemoteProfile, defaultTextSnippetLocalProfile, defaultTextSnippetGlobalProfile;
    public CrawlProfile defaultTextGreedyLearningProfile, defaultMediaSnippetLocalProfile, defaultMediaSnippetGlobalProfile, defaultSurrogateProfile;
    public CrawlProfile defaultAutocrawlDeepProfile, defaultAutocrawlShallowProfile;
    private Map<String, CrawlProfile> defaultPushProfiles; // for each collection one profile

    private final File queuesRoot;
    private Switchboard switchboard;

    public CrawlSwitchboard(Switchboard switchboard) {

        this.switchboard = switchboard;
        this.log = this.switchboard.log;
        this.queuesRoot = this.switchboard.queuesRoot;
        this.defaultPushProfiles = new ConcurrentHashMap<>();
        this.profilesActiveCrawlsCache = Collections.synchronizedMap(new TreeMap<byte[], CrawlProfile>(Base64Order.enhancedCoder));
        this.profilesActiveCrawlsCounter = new ConcurrentHashMap<String, RowHandleSet>();

        // make crawl profiles database and default profiles
        this.queuesRoot.mkdirs();
        this.log.config("Initializing Crawl Profiles");

        final File profilesActiveFile = new File(queuesRoot, DBFILE_ACTIVE_CRAWL_PROFILES);
        this.profilesActiveCrawls = loadFromDB(profilesActiveFile);
        // validation pass: check that every stored active profile can be deserialized;
        // unreadable entries are skipped here
        for (final byte[] handle : this.profilesActiveCrawls.keySet()) {
            CrawlProfile p;
            try {
                p = new CrawlProfile(this.profilesActiveCrawls.get(handle));
            } catch (final IOException e) {
                p = null;
            } catch (final SpaceExceededException e) {
                p = null;
            }
            if (p == null) {
                continue;
            }
        }
        initActiveCrawlProfiles();
        log.info("Loaded active crawl profiles from file " + profilesActiveFile.getName()
            + ", " + this.profilesActiveCrawls.size() + " entries");

        final File profilesPassiveFile = new File(queuesRoot, DBFILE_PASSIVE_CRAWL_PROFILES);
        this.profilesPassiveCrawls = loadFromDB(profilesPassiveFile);
        for (final byte[] handle : this.profilesPassiveCrawls.keySet()) {
            CrawlProfile p;
            try {
                p = new CrawlProfile(this.profilesPassiveCrawls.get(handle));
                ConcurrentLog.info("CrawlProfiles", "loaded Profile " + p.handle() + ": " + p.collectionName());
            } catch (final IOException e) {
                continue;
            } catch (final SpaceExceededException e) {
                continue;
            }
        }
        log.info("Loaded passive crawl profiles from file " + profilesPassiveFile.getName()
            + ", " + this.profilesPassiveCrawls.size() + " entries"
            + ", " + profilesPassiveFile.length() / 1024 + " kbytes");
    }

    /**
     * Get a profile from the active or the passive stack. Use this method to be sure
     * not to miss old, cleaned profiles. A profile that is found on the passive
     * stack is automatically moved back to the active stack.
     * @param profileKey the handle of the profile
     * @return the profile, or null if it exists on neither stack
     */
    public CrawlProfile get(final byte[] profileKey) {
        CrawlProfile profile = getActive(profileKey);
        if (profile != null) return profile;
        profile = getPassive(profileKey);
        if (profile == null) return null;
        // clean up
        this.putActive(profileKey, profile);
        this.removePassive(profileKey);
        return profile;
    }
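
    // A minimal usage sketch (hypothetical caller; assumes a Request r taken from one
    // of the crawler queues and sb.crawler as the Switchboard's CrawlSwitchboard):
    //
    //   CrawlProfile p = sb.crawler.get(UTF8.getBytes(r.profileHandle()));
    //   if (p == null) { /* the profile was deleted; the request can be discarded */ }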
    public CrawlProfile getActive(final byte[] profileKey) {
        if (profileKey == null) {
            return null;
        }
        // get from cache
        CrawlProfile p = this.profilesActiveCrawlsCache.get(profileKey);
        if (p != null) {
            return p;
        }
        // get from db
        Map<String, String> m;
        try {
            m = this.profilesActiveCrawls.get(profileKey);
        } catch (final IOException e) {
            m = null;
        } catch (final SpaceExceededException e) {
            m = null;
        }
        if (m == null) {
            return null; //return getPassive(profileKey);
        }
        p = new CrawlProfile(m);
        this.profilesActiveCrawlsCache.put(profileKey, p);
        return p;
    }

    public CrawlProfile getPassive(final byte[] profileKey) {
        if (profileKey == null) {
            return null;
        }
        Map<String, String> m;
        try {
            m = this.profilesPassiveCrawls.get(profileKey);
        } catch (final IOException e) {
            m = null;
        } catch (final SpaceExceededException e) {
            m = null;
        }
        if (m == null) {
            return null;
        }
        return new CrawlProfile(m);
    }

    public Set<byte[]> getActive() {
        return this.profilesActiveCrawls.keySet();
    }

    public Set<byte[]> getPassive() {
        return this.profilesPassiveCrawls.keySet();
    }

    public void removeActive(final byte[] profileKey) {
        if (profileKey == null) {
            return;
        }
        this.profilesActiveCrawlsCache.remove(profileKey);
        this.profilesActiveCrawls.remove(profileKey);
    }

    public void removePassive(final byte[] profileKey) {
        if (profileKey == null) {
            return;
        }
        this.profilesPassiveCrawls.remove(profileKey);
    }

    public void putActive(final byte[] profileKey, final CrawlProfile profile) {
        this.profilesActiveCrawls.put(profileKey, profile);
        this.profilesActiveCrawlsCache.put(profileKey, profile);
        this.removePassive(profileKey);
    }

    public void putPassive(final byte[] profileKey, final CrawlProfile profile) {
        this.profilesPassiveCrawls.put(profileKey, profile);
        this.removeActive(profileKey);
    }

    public RowHandleSet getURLHashes(final byte[] profileKey) {
        return this.profilesActiveCrawlsCounter.get(ASCII.String(profileKey));
    }
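
    /**
     * (Re-)creates the built-in default crawl profiles (autocrawl, proxy, remote,
     * snippet fetching, greedy learning, surrogate import) and stores them in the
     * active crawl profile DB. Called from the constructor and from resetProfiles().
     */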
    private void initActiveCrawlProfiles() {
        final Switchboard sb = Switchboard.getSwitchboard();

        // generate new default entry for deep auto crawl
        this.defaultAutocrawlDeepProfile = new CrawlProfile(
            CRAWL_PROFILE_AUTOCRAWL_DEEP,
            CrawlProfile.MATCH_ALL_STRING,   //crawlerUrlMustMatch
            CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
            CrawlProfile.MATCH_ALL_STRING,   //crawlerIpMustMatch
            CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
            CrawlProfile.MATCH_NEVER_STRING, //crawlerCountryMustMatch
            CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
            CrawlProfile.MATCH_ALL_STRING,   //indexUrlMustMatch
            CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
            CrawlProfile.MATCH_ALL_STRING,   //indexContentMustMatch
            CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
            Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_DEEP_DEPTH, "3")),
            true,
            CrawlProfile.getRecrawlDate(Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_DAYS, "1")) * 1440),
            -1,
            true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow
            sb.getConfigBool(SwitchboardConstants.AUTOCRAWL_INDEX_TEXT, true),
            sb.getConfigBool(SwitchboardConstants.AUTOCRAWL_INDEX_MEDIA, true),
            false,
            false,
            -1,
            false, true, CrawlProfile.MATCH_NEVER_STRING,
            CacheStrategy.NOCACHE,
            "robot_" + CRAWL_PROFILE_AUTOCRAWL_DEEP,
            ClientIdentification.yacyInternetCrawlerAgentName,
            null, 0);
        this.profilesActiveCrawls.put(
            UTF8.getBytes(this.defaultAutocrawlDeepProfile.handle()),
            this.defaultAutocrawlDeepProfile);

        // generate new default entry for shallow auto crawl
        this.defaultAutocrawlShallowProfile = new CrawlProfile(
            CRAWL_PROFILE_AUTOCRAWL_SHALLOW,
            CrawlProfile.MATCH_ALL_STRING,   //crawlerUrlMustMatch
            CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
            CrawlProfile.MATCH_ALL_STRING,   //crawlerIpMustMatch
            CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
            CrawlProfile.MATCH_NEVER_STRING, //crawlerCountryMustMatch
            CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
            CrawlProfile.MATCH_ALL_STRING,   //indexUrlMustMatch
            CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
            CrawlProfile.MATCH_ALL_STRING,   //indexContentMustMatch
            CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
            Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_SHALLOW_DEPTH, "1")),
            true,
            CrawlProfile.getRecrawlDate(Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_DAYS, "1")) * 1440),
            -1,
            true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow
            sb.getConfigBool(SwitchboardConstants.AUTOCRAWL_INDEX_TEXT, true),
            sb.getConfigBool(SwitchboardConstants.AUTOCRAWL_INDEX_MEDIA, true),
            false,
            false,
            -1,
            false, true, CrawlProfile.MATCH_NEVER_STRING,
            CacheStrategy.NOCACHE,
            "robot_" + CRAWL_PROFILE_AUTOCRAWL_SHALLOW,
            ClientIdentification.yacyInternetCrawlerAgentName,
            null, 0);
        this.profilesActiveCrawls.put(
            UTF8.getBytes(this.defaultAutocrawlShallowProfile.handle()),
            this.defaultAutocrawlShallowProfile);

        // generate new default entry for proxy crawling
        this.defaultProxyProfile = new CrawlProfile(
            CRAWL_PROFILE_PROXY,
            CrawlProfile.MATCH_ALL_STRING,   //crawlerUrlMustMatch
            CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
            CrawlProfile.MATCH_ALL_STRING,   //crawlerIpMustMatch
            CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
            CrawlProfile.MATCH_NEVER_STRING, //crawlerCountryMustMatch
            CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
            CrawlProfile.MATCH_ALL_STRING,   //indexUrlMustMatch
            CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
            CrawlProfile.MATCH_ALL_STRING,   //indexContentMustMatch
            CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
            Integer.parseInt(sb.getConfig(SwitchboardConstants.PROXY_PREFETCH_DEPTH, "0")),
            true,
            CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE),
            -1,
            false, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow
            sb.getConfigBool(SwitchboardConstants.PROXY_INDEXING_LOCAL_TEXT, true),
            sb.getConfigBool(SwitchboardConstants.PROXY_INDEXING_LOCAL_MEDIA, true),
            true,
            sb.getConfigBool(SwitchboardConstants.PROXY_INDEXING_REMOTE, false),
            -1,
            false, true, CrawlProfile.MATCH_NEVER_STRING,
            CacheStrategy.IFFRESH,
            "robot_" + CRAWL_PROFILE_PROXY,
            ClientIdentification.yacyProxyAgentName,
            null, 0);
        this.profilesActiveCrawls.put(
            UTF8.getBytes(this.defaultProxyProfile.handle()),
            this.defaultProxyProfile);
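
        // Apart from the patterns and flags labelled above, the long positional
        // argument lists are hard to read; as a guide, the two values following the
        // crawlingQ..obeyHtmlRobotsNofollow cluster are indexText and indexMedia
        // (see the labelled call for CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA below).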
        // generate new default entry for remote crawling
        this.defaultRemoteProfile = new CrawlProfile(
            CRAWL_PROFILE_REMOTE,
            CrawlProfile.MATCH_ALL_STRING,   //crawlerUrlMustMatch
            CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
            CrawlProfile.MATCH_ALL_STRING,   //crawlerIpMustMatch
            CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
            CrawlProfile.MATCH_NEVER_STRING, //crawlerCountryMustMatch
            CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
            CrawlProfile.MATCH_ALL_STRING,   //indexUrlMustMatch
            CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
            CrawlProfile.MATCH_ALL_STRING,   //indexContentMustMatch
            CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
            0,
            false,
            null,
            -1,
            true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow
            true,
            true,
            false,
            false,
            -1,
            false, true, CrawlProfile.MATCH_NEVER_STRING,
            CacheStrategy.IFFRESH,
            "robot_" + CRAWL_PROFILE_REMOTE,
            ClientIdentification.yacyInternetCrawlerAgentName,
            null, 0);
        this.profilesActiveCrawls.put(
            UTF8.getBytes(this.defaultRemoteProfile.handle()),
            this.defaultRemoteProfile);

        // generate new default entry for snippet fetch and optional crawling
        this.defaultTextSnippetLocalProfile = new CrawlProfile(
            CRAWL_PROFILE_SNIPPET_LOCAL_TEXT,
            CrawlProfile.MATCH_ALL_STRING,   //crawlerUrlMustMatch
            CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
            CrawlProfile.MATCH_ALL_STRING,   //crawlerIpMustMatch
            CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
            CrawlProfile.MATCH_NEVER_STRING, //crawlerCountryMustMatch
            CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
            CrawlProfile.MATCH_ALL_STRING,   //indexUrlMustMatch
            CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
            CrawlProfile.MATCH_ALL_STRING,   //indexContentMustMatch
            CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
            0,
            false,
            CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE),
            -1,
            true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow
            false,
            false,
            true,
            false,
            -1,
            false, true, CrawlProfile.MATCH_NEVER_STRING,
            CacheStrategy.IFEXIST,
            "robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_TEXT,
            ClientIdentification.yacyIntranetCrawlerAgentName,
            null, 0);
        this.profilesActiveCrawls.put(
            UTF8.getBytes(this.defaultTextSnippetLocalProfile.handle()),
            this.defaultTextSnippetLocalProfile);

        // generate new default entry for snippet fetch and optional crawling
        this.defaultTextSnippetGlobalProfile = new CrawlProfile(
            CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT,
            CrawlProfile.MATCH_ALL_STRING,   //crawlerUrlMustMatch
            CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
            CrawlProfile.MATCH_ALL_STRING,   //crawlerIpMustMatch
            CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
            CrawlProfile.MATCH_NEVER_STRING, //crawlerCountryMustMatch
            CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
            CrawlProfile.MATCH_ALL_STRING,   //indexUrlMustMatch
            CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
            CrawlProfile.MATCH_ALL_STRING,   //indexContentMustMatch
            CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
            0,
            false,
            CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE),
            -1,
            true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow
            true,
            true,
            true,
            false,
            -1,
            false, true, CrawlProfile.MATCH_NEVER_STRING,
            CacheStrategy.IFEXIST,
            "robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT,
            ClientIdentification.yacyIntranetCrawlerAgentName,
            null, 0);
        this.profilesActiveCrawls.put(
            UTF8.getBytes(this.defaultTextSnippetGlobalProfile.handle()),
            this.defaultTextSnippetGlobalProfile);
        this.defaultTextSnippetGlobalProfile.setCacheStrategy(CacheStrategy.IFEXIST);
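
        // (Note: the setCacheStrategy call above is redundant; the global text
        // snippet profile is already constructed with CacheStrategy.IFEXIST.)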
        // generate new default entry for greedy learning
        this.defaultTextGreedyLearningProfile = new CrawlProfile(
            CRAWL_PROFILE_GREEDY_LEARNING_TEXT,
            CrawlProfile.MATCH_ALL_STRING,   //crawlerUrlMustMatch
            CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
            CrawlProfile.MATCH_ALL_STRING,   //crawlerIpMustMatch
            CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
            CrawlProfile.MATCH_NEVER_STRING, //crawlerCountryMustMatch
            CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
            CrawlProfile.MATCH_ALL_STRING,   //indexUrlMustMatch
            CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
            CrawlProfile.MATCH_ALL_STRING,   //indexContentMustMatch
            CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
            0,
            false,
            CrawlProfile.getRecrawlDate(CRAWL_PROFILE_GREEDY_LEARNING_TEXT_RECRAWL_CYCLE),
            -1,
            true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow
            false,
            false,
            true,
            false,
            -1,
            false, true, CrawlProfile.MATCH_NEVER_STRING,
            CacheStrategy.IFEXIST,
            "robot_" + CRAWL_PROFILE_GREEDY_LEARNING_TEXT,
            ClientIdentification.browserAgentName,
            null, 0);
        this.profilesActiveCrawls.put(
            UTF8.getBytes(this.defaultTextGreedyLearningProfile.handle()),
            this.defaultTextGreedyLearningProfile);

        // generate new default entry for snippet fetch and optional crawling
        this.defaultMediaSnippetLocalProfile = new CrawlProfile(
            CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA,
            CrawlProfile.MATCH_ALL_STRING,   //crawlerUrlMustMatch
            CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
            CrawlProfile.MATCH_ALL_STRING,   //crawlerIpMustMatch
            CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
            CrawlProfile.MATCH_NEVER_STRING, //crawlerCountryMustMatch
            CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
            CrawlProfile.MATCH_ALL_STRING,   //indexUrlMustMatch
            CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
            CrawlProfile.MATCH_ALL_STRING,   //indexContentMustMatch
            CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
            0,
            false,
            CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE),
            -1,
            true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow
            false, // indexText
            false, // indexMedia
            true,
            false,
            -1,
            false, true, CrawlProfile.MATCH_NEVER_STRING,
            CacheStrategy.IFEXIST,
            "robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA,
            ClientIdentification.yacyIntranetCrawlerAgentName,
            null, 0);
        this.profilesActiveCrawls.put(
            UTF8.getBytes(this.defaultMediaSnippetLocalProfile.handle()),
            this.defaultMediaSnippetLocalProfile);
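
        // The global media profile below differs from the local one above only in
        // its name, its recrawl cycle constant, and indexMedia (true instead of false).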
        // generate new default entry for snippet fetch and optional crawling
        this.defaultMediaSnippetGlobalProfile = new CrawlProfile(
            CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA,
            CrawlProfile.MATCH_ALL_STRING,   //crawlerUrlMustMatch
            CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
            CrawlProfile.MATCH_ALL_STRING,   //crawlerIpMustMatch
            CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
            CrawlProfile.MATCH_NEVER_STRING, //crawlerCountryMustMatch
            CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
            CrawlProfile.MATCH_ALL_STRING,   //indexUrlMustMatch
            CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
            CrawlProfile.MATCH_ALL_STRING,   //indexContentMustMatch
            CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
            0,
            false,
            CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE),
            -1,
            true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow
            false, // indexText
            true,  // indexMedia
            true,
            false,
            -1,
            false, true, CrawlProfile.MATCH_NEVER_STRING,
            CacheStrategy.IFEXIST,
            "robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA,
            ClientIdentification.yacyIntranetCrawlerAgentName,
            null, 0);
        this.profilesActiveCrawls.put(
            UTF8.getBytes(this.defaultMediaSnippetGlobalProfile.handle()),
            this.defaultMediaSnippetGlobalProfile);

        // generate new default entry for surrogate parsing
        this.defaultSurrogateProfile = new CrawlProfile(
            CRAWL_PROFILE_SURROGATE,
            CrawlProfile.MATCH_ALL_STRING,   //crawlerUrlMustMatch
            CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
            CrawlProfile.MATCH_ALL_STRING,   //crawlerIpMustMatch
            CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
            CrawlProfile.MATCH_NEVER_STRING, //crawlerCountryMustMatch
            CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
            CrawlProfile.MATCH_ALL_STRING,   //indexUrlMustMatch
            CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
            CrawlProfile.MATCH_ALL_STRING,   //indexContentMustMatch
            CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
            0,
            false,
            CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE),
            -1,
            true, true, false, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow
            true,
            false,
            false,
            false,
            -1,
            false, true, CrawlProfile.MATCH_NEVER_STRING,
            CacheStrategy.NOCACHE,
            "robot_" + CRAWL_PROFILE_SURROGATE,
            ClientIdentification.yacyIntranetCrawlerAgentName,
            null, 0);
        this.profilesActiveCrawls.put(
            UTF8.getBytes(this.defaultSurrogateProfile.handle()),
            this.defaultSurrogateProfile);
    }

    /**
     * Gets the push crawl profile for the given collection, creating and storing it
     * on first use; one profile is kept per collection.
     * @param collection the collection name
     * @return the push crawl profile for this collection
     */
    public CrawlProfile getPushCrawlProfile(String collection) {
        CrawlProfile genericPushProfile = this.defaultPushProfiles.get(collection);
        if (genericPushProfile != null) return genericPushProfile;
        genericPushProfile = new CrawlProfile(
            CrawlProfile.CRAWL_PROFILE_PUSH_STUB + collection,
            CrawlProfile.MATCH_ALL_STRING,   //crawlerUrlMustMatch
            CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
            CrawlProfile.MATCH_ALL_STRING,   //crawlerIpMustMatch
            CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
            CrawlProfile.MATCH_NEVER_STRING, //crawlerCountryMustMatch
            CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
            CrawlProfile.MATCH_ALL_STRING,   //indexUrlMustMatch
            CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
            CrawlProfile.MATCH_ALL_STRING,   //indexContentMustMatch
            CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
            0,
            false,
            null,
            -1,
            true, true, false, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow
            true,
            true,
            false,
            false,
            -1,
            false, true, CrawlProfile.MATCH_NEVER_STRING,
            CacheStrategy.NOCACHE,
            collection,
            ClientIdentification.yacyIntranetCrawlerAgentName,
            null, 0);
        this.profilesActiveCrawls.put(UTF8.getBytes(genericPushProfile.handle()), genericPushProfile);
        this.defaultPushProfiles.put(collection, genericPushProfile);
        return genericPushProfile;
    }
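
    // A minimal usage sketch (hypothetical caller, e.g. a servlet that accepts
    // pushed documents; assumes sb.crawler as the Switchboard's CrawlSwitchboard):
    // obtain the per-collection profile and use its handle to stack the document.
    //
    //   CrawlProfile pushProfile = sb.crawler.getPushCrawlProfile("user");
    //   String handle = pushProfile.handle();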
    private void resetProfiles() {
        this.profilesActiveCrawlsCache.clear();
        final File pdb = new File(this.queuesRoot, DBFILE_ACTIVE_CRAWL_PROFILES);
        if (pdb.exists()) {
            FileUtils.deletedelete(pdb);
        }
        try {
            this.profilesActiveCrawls = new MapHeap(pdb, Word.commonHashLength, NaturalOrder.naturalOrder, 1024 * 64, 500, ' ');
        } catch (final IOException e1) {
            ConcurrentLog.logException(e1);
            this.profilesActiveCrawls = null;
        }
        initActiveCrawlProfiles();
    }

    /**
     * Moves all non-default profiles from the active to the passive stack.
     * @return true if at least one profile was moved or the profile DB was reset
     * @throws InterruptedException if a shutdown is in progress
     */
    public boolean clear() throws InterruptedException {
        this.profilesActiveCrawlsCache.clear();
        CrawlProfile entry;
        boolean hasDoneSomething = false;
        try {
            for (final byte[] handle : this.profilesActiveCrawls.keySet()) {
                // check for interruption
                if (Thread.currentThread().isInterrupted()) {
                    throw new InterruptedException("Shutdown in progress");
                }
                // getting next profile
                try {
                    entry = new CrawlProfile(this.profilesActiveCrawls.get(handle));
                } catch (final IOException e) {
                    continue;
                } catch (final SpaceExceededException e) {
                    continue;
                }
                if (!DEFAULT_PROFILES.contains(entry.name())) {
                    final CrawlProfile p = new CrawlProfile(entry);
                    this.profilesPassiveCrawls.put(UTF8.getBytes(p.handle()), p);
                    this.profilesActiveCrawls.remove(handle);
                    hasDoneSomething = true;
                }
            }
        } catch (final kelondroException e) {
            resetProfiles();
            hasDoneSomething = true;
        }
        return hasDoneSomething;
    }

    public Set<String> getActiveProfiles() {
        // find all active profiles that are not default profiles; these are
        // candidates for deletion
        Set<String> profileKeys = new HashSet<String>();
        for (final byte[] handle : this.getActive()) {
            CrawlProfile entry;
            entry = new CrawlProfile(this.getActive(handle));
            if (!CrawlSwitchboard.DEFAULT_PROFILES.contains(entry.name())) {
                profileKeys.add(ASCII.String(handle));
            }
        }
        return profileKeys;
    }

    /**
     * Determines which non-default active profiles are no longer referenced by any
     * queued or currently loading request. As a side effect, this fills the
     * per-profile URL hash counter that getURLHashes() reads.
     * @param crawlQueues the crawl queues to scan
     * @return the handles of finished profiles, or an empty set if the scan timed out
     */
    public Set<String> getFinishedProfiles(CrawlQueues crawlQueues) {
        // clear the counter cache
        this.profilesActiveCrawlsCounter.clear();

        // find all profiles that are candidates for deletion
        Set<String> deletionCandidate = getActiveProfiles();
        if (deletionCandidate.size() == 0) return new HashSet<String>(0);

        // iterate through all the queues and see if one of these handles appears there;
        // this is a time-consuming process, so we set a time-out
        long timeout = System.currentTimeMillis() + 60000L; // one minute
        try {
            for (StackType stack : StackType.values()) {
                Iterator<Request> sei = crawlQueues.noticeURL.iterator(stack);
                if (sei == null) continue;
                Request r;
                while (sei.hasNext()) {
                    r = sei.next();
                    if (r == null) continue;
                    String handle = r.profileHandle();
                    RowHandleSet us = this.profilesActiveCrawlsCounter.get(handle);
                    if (us == null) {
                        us = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 0);
                        this.profilesActiveCrawlsCounter.put(handle, us);
                    }
                    if (us.size() < 100) us.put(r.url().hash()); // store the hash, but not too many
                    deletionCandidate.remove(handle);
                    if (deletionCandidate.size() == 0) return new HashSet<String>(0);
                    if (System.currentTimeMillis() > timeout) return new HashSet<String>(0); // give up; this is too large
                }
                if (deletionCandidate.size() == 0) return new HashSet<String>(0);
            }
            // look into the CrawlQueues.worker as well
            Map<DigestURL, Request> map = switchboard.crawlQueues.activeWorkerEntries();
            for (Request request : map.values()) {
                deletionCandidate.remove(request.profileHandle());
            }
        } catch (final Throwable e) {
            ConcurrentLog.logException(e);
            return new HashSet<String>(0);
        }
        return deletionCandidate;
    }

    public boolean allCrawlsFinished(CrawlQueues crawlQueues) {
        if (!crawlQueues.noticeURL.isEmpty()) return false;
        // look into the CrawlQueues.worker as well
        if (switchboard.crawlQueues.activeWorkerEntries().size() > 0) return false;
        return true;
    }

    public void cleanProfiles(Set<String> deletionCandidate) {
        // all entries that are left are candidates for deletion; do that now
        for (String h : deletionCandidate) {
            byte[] handle = ASCII.getBytes(h);
            final CrawlProfile p = this.getActive(handle);
            if (p != null) {
                this.putPassive(handle, p);
                this.removeActive(handle);
            }
        }
    }

    public synchronized void close() {
        this.profilesActiveCrawlsCache.clear();
        this.profilesActiveCrawls.close();
        this.profilesPassiveCrawls.close();
    }
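
    // Typical cleanup cycle (hypothetical scheduler code; assumes sb is the
    // Switchboard): once no queue references a profile any more, move it to the
    // passive stack.
    //
    //   Set<String> finished = sb.crawler.getFinishedProfiles(sb.crawlQueues);
    //   if (!finished.isEmpty()) sb.crawler.cleanProfiles(finished);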
    /**
     * Loads crawl profiles from a DB file. If the file cannot be opened, it is
     * deleted and a fresh, empty DB is created in its place.
     * @param file DB file
     * @return crawl profile data, or null if even the fresh DB cannot be created
     */
    private static MapHeap loadFromDB(final File file) {
        MapHeap ret;
        try {
            ret = new MapHeap(file, Word.commonHashLength, NaturalOrder.naturalOrder, 1024 * 64, 500, ' ');
        } catch (final IOException e) {
            ConcurrentLog.logException(e);
            FileUtils.deletedelete(file);
            try {
                ret = new MapHeap(file, Word.commonHashLength, NaturalOrder.naturalOrder, 1024 * 64, 500, ' ');
            } catch (final IOException e1) {
                ConcurrentLog.logException(e1);
                ret = null;
            }
        }
        return ret;
    }
}