/**
 *  HostQueue
 *  Copyright 2013 by Michael Christen
 *  First released 24.09.2013 at http://yacy.net
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program in the file lgpl21.txt
 *  If not, see <http://www.gnu.org/licenses/>.
 */

package net.yacy.crawler;

import java.io.File;
import java.io.IOException;
import java.lang.reflect.Array;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.ConcurrentModificationException;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;

import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.Latency;
import net.yacy.crawler.retrieval.Request;
import net.yacy.crawler.robots.RobotsTxt;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.index.BufferedObjectIndex;
import net.yacy.kelondro.index.Index;
import net.yacy.kelondro.index.OnDemandOpenFileIndex;
import net.yacy.kelondro.index.Row;
import net.yacy.kelondro.index.RowHandleSet;
import net.yacy.kelondro.table.Table;
import static net.yacy.kelondro.util.FileUtils.deletedelete;
import net.yacy.kelondro.util.kelondroException;
import net.yacy.repository.Blacklist.BlacklistType;
import net.yacy.search.Switchboard;

public class HostQueue implements Balancer {

    private final static ConcurrentLog log = new ConcurrentLog("HostQueue");

    public  static final String indexSuffix           = ".stack";
    private static final int    EcoFSBufferSize       = 1000;
    private static final int    objectIndexBufferSize = 1000;

    private final File    hostPath; // path to the stack files
    private final String  hostName;
    private final String  hostHash;
    private final int     port;
    private final boolean exceed134217727;
    private final boolean onDemand;
    private       TreeMap<Integer, Index> depthStacks;

    /**
     * Create or open a host queue. The host part of the hostUrl parameter is
     * used to calculate the name of the stack directory.
     *
     * @param hostsPath
     * @param hostUrl
     * @param onDemand
     * @param exceed134217727
     * @throws MalformedURLException
     */
    public HostQueue(
            final File hostsPath,
            final DigestURL hostUrl, // any url from that host (only the host data are extracted)
            final boolean onDemand,
            final boolean exceed134217727) throws MalformedURLException {
        this.onDemand = onDemand;
        this.exceed134217727 = exceed134217727;
        // the host might be null (file://), but the host queue needs a name for its queue file
        this.hostName = (hostUrl.getHost() == null) ? "localhost" : hostUrl.getHost();
        this.port = hostUrl.getPort();
        this.hostHash = hostUrl.hosthash(); // the hosthash is calculated from protocol + hostname + port
        // hostName and port are included in the directory name for human readability (and historically);
        // the "-#" marker defines the begin of the hosthash within the directory name
        this.hostPath = new File(hostsPath, this.hostName + "-#" + this.hostHash + "." + this.port);
        init();
    }

    /**
     * Initializes a host queue from existing cache files. The internal id of
     * the queue is extracted from the path name and must match the key that
     * was initially generated; currently the hosthash is used as id.
     *
     * @param hostPath path of the stack directory (containing the primary key/id of the queue)
     * @param onDemand
     * @param exceed134217727
     * @throws MalformedURLException
     */
    public HostQueue(
            final File hostPath,
            final boolean onDemand,
            final boolean exceed134217727) throws MalformedURLException {
        this.onDemand = onDemand;
        this.exceed134217727 = exceed134217727;
        this.hostPath = hostPath;
        // parse the hostName and port from the file name
        final String filename = hostPath.getName();
        final int pdot = filename.lastIndexOf('.');
        if (pdot < 0) throw new RuntimeException("hostPath name must contain a dot: " + filename);
        // note: a name like "host.com" contains a dot but not the required port suffix -> parseInt will throw an exception
        this.port = Integer.parseInt(filename.substring(pdot + 1));
        final int p1 = filename.lastIndexOf("-#");
        if (p1 >= 0) {
            this.hostName = filename.substring(0, p1);
            this.hostHash = filename.substring(p1 + 2, pdot);
        } else {
            throw new RuntimeException("hostPath name must contain -# followed by the hosthash: " + filename);
        }
        init();
    }

    /**
     * Opens and initializes the host queue.
     * @throws MalformedURLException if the directory for the host could not be created
     */
    private final void init() throws MalformedURLException {
        if (!(this.hostPath.exists())) {
            this.hostPath.mkdirs();
            if (!this.hostPath.exists()) { // check if the directory was created (if not, likely a name violation)
                throw new MalformedURLException("hostPath could not be created: " + this.hostPath.toString());
            }
        }
        this.depthStacks = new TreeMap<Integer, Index>();
        final int size = openAllStacks();
        if (log.isInfo()) log.info("opened HostQueue " + this.hostPath.getAbsolutePath() + " with " + size + " urls.");
    }

    public String getHost() {
        return this.hostName;
    }

    public int getPort() {
        return this.port;
    }
    /**
     * Get the hosthash of this queue as determined during init.
     *
     * @return the hosthash (calculated from protocol + hostname + port)
     */
    public String getHostHash() {
        return this.hostHash;
    }

    private int openAllStacks() {
        final String[] l = this.hostPath.list();
        int c = 0;
        if (l != null) for (final String s: l) {
            if (s.endsWith(indexSuffix)) try {
                final int depth = Integer.parseInt(s.substring(0, s.length() - indexSuffix.length()));
                final File stackFile = new File(this.hostPath, s);
                final Index depthStack = openStack(stackFile);
                if (depthStack != null) {
                    final int sz = depthStack.size();
                    if (sz == 0) {
                        depthStack.close();
                        deletedelete(stackFile);
                    } else {
                        this.depthStacks.put(depth, depthStack);
                        c += sz;
                    }
                }
            } catch (NumberFormatException e) {}
        }
        return c;
    }

    private Index getLowestStack() {
        while (this.depthStacks.size() > 0) {
            Map.Entry<Integer, Index> entry;
            synchronized (this) {
                entry = this.depthStacks.firstEntry();
            }
            if (entry == null) return null; // happens only if the map is empty
            if (entry.getValue().size() == 0) {
                entry.getValue().close();
                deletedelete(getFile(entry.getKey()));
                this.depthStacks.remove(entry.getKey());
                continue;
            }
            return entry.getValue();
        }
        // this should not happen
        //assert false;
        return null;
    }

    /**
     * Get the existing url stack for the given crawl depth or create a new (empty) stack.
     *
     * @param depth
     * @return the existing or a new/empty stack
     */
    private Index getStack(int depth) {
        Index depthStack;
        synchronized (this) {
            depthStack = this.depthStacks.get(depth);
            if (depthStack != null) return depthStack;
            // now actually create a new stack
            final File f = getFile(depth);
            depthStack = openStack(f);
            if (depthStack != null) this.depthStacks.put(depth, depthStack);
        }
        return depthStack;
    }

    private File getFile(int depth) {
        String name = Integer.toString(depth);
        while (name.length() < 4) name = "0" + name; // pad the depth to four digits
        return new File(this.hostPath, name + indexSuffix);
    }

    private Index openStack(File f) {
        for (int i = 0; i < 10; i++) {
            // we try again if opening fails because it shall not fail
            if (this.onDemand && (!f.exists() || f.length() < 10000)) {
                try {
                    return new BufferedObjectIndex(new OnDemandOpenFileIndex(f, Request.rowdef, exceed134217727), objectIndexBufferSize);
                } catch (kelondroException e) {
                    // possibly the file was closed meanwhile
                    ConcurrentLog.logException(e);
                }
            } else {
                try {
                    return new BufferedObjectIndex(new Table(f, Request.rowdef, EcoFSBufferSize, 0, false, exceed134217727, true), objectIndexBufferSize);
                } catch (final SpaceExceededException e) {
                    try {
                        return new BufferedObjectIndex(new Table(f, Request.rowdef, 0, 0, false, exceed134217727, true), objectIndexBufferSize);
                    } catch (final SpaceExceededException e1) {
                        ConcurrentLog.logException(e1);
                    }
                } catch (kelondroException e) {
                    // possibly the file was closed meanwhile
                    ConcurrentLog.logException(e);
                }
            }
        }
        return null;
    }

    @Override
    public synchronized void close() {
        for (final Map.Entry<Integer, Index> entry: this.depthStacks.entrySet()) {
            final int size = entry.getValue().size();
            entry.getValue().close();
            if (size == 0) deletedelete(getFile(entry.getKey()));
        }
        this.depthStacks.clear();
        // remove the queue directory if it is empty now
        // (hostPath is final and assigned in the constructors, so no null check is needed)
        final String[] l = this.hostPath.list();
        if (l == null || l.length == 0) deletedelete(this.hostPath);
    }

    @Override
    public synchronized void clear() {
        for (final Map.Entry<Integer, Index> entry: this.depthStacks.entrySet()) {
            entry.getValue().close();
            deletedelete(getFile(entry.getKey()));
        }
        this.depthStacks.clear();
        final String[] l = this.hostPath.list();
        if (l != null) for (final String s: l) {
            deletedelete(new File(this.hostPath, s));
        }
        deletedelete(this.hostPath);
    }
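    /* Example (illustrative only): the depth-to-file mapping produced by getFile().
     * A queue directory could look like this, assuming entries were pushed at
     * crawl depths 0, 2 and 13 (the hosthash "AbCdEf" is a made-up placeholder):
     *
     *   example.org-#AbCdEf.80/0000.stack
     *   example.org-#AbCdEf.80/0002.stack
     *   example.org-#AbCdEf.80/0013.stack
     *
     * openAllStacks() reverses this mapping on startup by parsing the depth
     * back out of each *.stack file name.
     */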
    @Override
    public Request get(final byte[] urlhash) throws IOException {
        assert urlhash != null;
        if (this.depthStacks == null) return null; // case occurs during shutdown
        for (final Index depthStack: this.depthStacks.values()) {
            final Row.Entry entry = depthStack.get(urlhash, false);
            // the hash may be in any of the depth stacks, so keep searching if this one does not contain it
            if (entry != null) return new Request(entry);
        }
        return null;
    }

    @Override
    public int removeAllByProfileHandle(final String profileHandle, final long timeout) throws IOException, SpaceExceededException {
        // first find a list of url hashes that shall be deleted
        final long terminate = timeout == Long.MAX_VALUE ? Long.MAX_VALUE : (timeout > 0) ? System.currentTimeMillis() + timeout : Long.MAX_VALUE;
        int count = 0;
        synchronized (this) {
            for (final Index depthStack: this.depthStacks.values()) {
                final HandleSet urlHashes = new RowHandleSet(Word.commonHashLength, Base64Order.enhancedCoder, 100);
                final Iterator<Row.Entry> i = depthStack.rows();
                Row.Entry rowEntry;
                Request crawlEntry;
                while (i.hasNext() && (System.currentTimeMillis() < terminate)) {
                    rowEntry = i.next();
                    crawlEntry = new Request(rowEntry);
                    if (crawlEntry.profileHandle().equals(profileHandle)) {
                        urlHashes.put(crawlEntry.url().hash());
                    }
                }
                for (final byte[] urlhash: urlHashes) {
                    depthStack.remove(urlhash);
                    count++;
                }
            }
        }
        return count;
    }

    /**
     * Delete all urls which are stored for the given host hashes.
     * @param hosthashes
     * @return the number of deleted urls
     */
    @Override
    public int removeAllByHostHashes(final Set<String> hosthashes) {
        for (final String h: hosthashes) {
            if (this.hostHash.equals(h)) {
                final int s = this.size();
                this.clear();
                return s;
            }
        }
        return 0;
    }

    /**
     * Remove urls from the queue.
     * @param urlHashes a set of hashes that shall be removed
     * @return the number of entries that have been removed
     * @throws IOException
     */
    @Override
    public synchronized int remove(final HandleSet urlHashes) throws IOException {
        int removedCounter = 0;
        for (final Index depthStack: this.depthStacks.values()) {
            final int s = depthStack.size();
            int removedHere = 0; // count per stack so that the size assertion below holds
            for (final byte[] urlhash: urlHashes) {
                final Row.Entry entry = depthStack.remove(urlhash);
                if (entry != null) removedHere++;
            }
            assert depthStack.size() + removedHere == s : "urlFileIndex.size() = " + depthStack.size() + ", s = " + s;
            removedCounter += removedHere;
        }
        return removedCounter;
    }

    @Override
    public boolean has(final byte[] urlhashb) {
        for (int retry = 0; retry < 3; retry++) {
            try {
                for (final Index depthStack: this.depthStacks.values()) {
                    if (depthStack.has(urlhashb)) return true;
                }
                return false;
            } catch (ConcurrentModificationException e) {}
        }
        return false;
    }

    @Override
    public int size() {
        int size = 0;
        for (final Index depthStack: this.depthStacks.values()) {
            size += depthStack.size();
        }
        return size;
    }

    @Override
    public boolean isEmpty() {
        for (final Index depthStack: this.depthStacks.values()) {
            if (!depthStack.isEmpty()) return false;
        }
        return true;
    }
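    /* Example (illustrative sketch): removing known entries from a queue.
     * "queue" and "someUrl" are hypothetical; RowHandleSet, Word and
     * Base64Order are the same types this class already uses.
     *
     *   HandleSet hashes = new RowHandleSet(Word.commonHashLength, Base64Order.enhancedCoder, 2);
     *   hashes.put(someUrl.hash());          // someUrl: a hypothetical DigestURL already in the queue
     *   int removed = queue.remove(hashes);  // returns the number of entries actually deleted
     */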
    @Override
    public String push(final Request entry, CrawlProfile profile, final RobotsTxt robots) throws IOException, SpaceExceededException {
        assert entry != null;
        final byte[] hash = entry.url().hash();
        synchronized (this) {
            // double-check
            if (this.has(hash)) return "double occurrence in urlFileIndex";

            // increase dom counter
            if (profile != null) {
                final int maxPages = profile.domMaxPages();
                if (maxPages != Integer.MAX_VALUE && maxPages > 0) {
                    final String host = entry.url().getHost();
                    profile.domInc(host);
                }
            }

            // add to index
            final Index depthStack = getStack(entry.depth());
            final int s = depthStack.size();
            depthStack.put(entry.toRow());
            assert s < depthStack.size() : "hash = " + ASCII.String(hash) + ", s = " + s + ", size = " + depthStack.size();
            assert depthStack.has(hash) : "hash = " + ASCII.String(hash);
        }
        return null;
    }
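    /* Example (illustrative sketch): pushing a crawl request. "queue",
     * "request", "profile" and "robots" are hypothetical, already-initialized
     * objects; push() returns null on success or a textual rejection reason.
     *
     *   String rejectReason = queue.push(request, profile, robots);
     *   if (rejectReason != null) log.info("not queued: " + rejectReason);
     */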
    @Override
    public Request pop(boolean delay, CrawlSwitchboard cs, RobotsTxt robots) throws IOException {
        // returns a crawl entry from the stack and ensures minimum delta times
        long sleeptime = 0;
        Request crawlEntry = null;
        CrawlProfile profileEntry = null;
        synchronized (this) {
            mainloop: while (true) {
                final Index depthStack = getLowestStack();
                if (depthStack == null) return null;
                Row.Entry rowEntry = null;
                while (depthStack.size() > 0) {
                    rowEntry = depthStack.removeOne();
                    if (rowEntry != null) break;
                }
                if (rowEntry == null) continue mainloop;
                crawlEntry = new Request(rowEntry);

                // check the blacklist (again) because the user may have created blacklist entries after the queue was filled
                if (Switchboard.urlBlacklist.isListed(BlacklistType.CRAWLER, crawlEntry.url())) {
                    if (log.isFine()) log.fine("URL '" + crawlEntry.url() + "' is in blacklist.");
                    continue mainloop;
                }

                // at this point we must check if the crawlEntry is still relevant, i.e. if its crawl profile still exists;
                // if not: return null. A calling method must handle the null value and try again
                profileEntry = cs.get(UTF8.getBytes(crawlEntry.profileHandle()));
                if (profileEntry == null) {
                    if (log.isFine()) log.fine("no profile entry for handle " + crawlEntry.profileHandle());
                    continue mainloop;
                }

                // depending on the caching policy we need sleep time to avoid DoS-like situations
                sleeptime = Latency.getDomainSleepTime(robots, profileEntry, crawlEntry.url());
                break;
            }
        }
        if (crawlEntry == null) return null;

        final ClientIdentification.Agent agent = profileEntry == null ? ClientIdentification.yacyInternetCrawlerAgent : profileEntry.getAgent();
        final long robotsTime = Latency.getRobotsTime(robots, crawlEntry.url(), agent);
        Latency.updateAfterSelection(crawlEntry.url(), profileEntry == null ? 0 : robotsTime);
        if (delay && sleeptime > 0) {
            // force busy waiting here;
            // in the best case this never happens if the balancer works properly.
            // This is only a protection against the worst case, where the crawler
            // could behave in a DoS-manner.
            if (log.isInfo()) log.info("forcing crawl-delay of " + sleeptime + " milliseconds for " + crawlEntry.url().getHost() + ": " + Latency.waitingRemainingExplain(crawlEntry.url(), robots, agent));
            long loops = sleeptime / 1000;
            long rest = sleeptime % 1000;
            if (loops < 3) {
                rest = rest + 1000 * loops;
                loops = 0;
            }
            Thread.currentThread().setName("Balancer waiting for " + crawlEntry.url().getHost() + ": " + sleeptime + " milliseconds");
            synchronized (this) {
                // must be synchronized here to avoid 'takeover' moves from other threads,
                // which would then idle for the same time; that would not be enough
                if (rest > 0) {try {this.wait(rest);} catch (final InterruptedException e) {}}
                for (int i = 0; i < loops; i++) {
                    if (log.isInfo()) log.info("waiting for " + crawlEntry.url().getHost() + ": " + (loops - i) + " seconds remaining...");
                    try {this.wait(1000);} catch (final InterruptedException e) {}
                }
            }
            Latency.updateAfterSelection(crawlEntry.url(), robotsTime);
        }
        return crawlEntry;
    }

    @Override
    public Iterator<Request> iterator() throws IOException {
        final Iterator<Map.Entry<Integer, Index>> depthIterator = this.depthStacks.entrySet().iterator();
        // a one-element array serves as a mutable holder for the current row iterator inside the anonymous class
        @SuppressWarnings("unchecked")
        final Iterator<Row.Entry>[] rowIterator = (Iterator<Row.Entry>[]) Array.newInstance(Iterator.class, 1);
        rowIterator[0] = null;
        return new Iterator<Request>() {
            @Override
            public boolean hasNext() {
                return depthIterator.hasNext() || (rowIterator[0] != null && rowIterator[0].hasNext());
            }
            @Override
            public Request next() {
                synchronized (HostQueue.this) {
                    try {
                        while (rowIterator[0] == null || !rowIterator[0].hasNext()) {
                            final Map.Entry<Integer, Index> entry = depthIterator.next();
                            rowIterator[0] = entry.getValue().iterator();
                        }
                        if (!rowIterator[0].hasNext()) return null;
                        final Row.Entry rowEntry = rowIterator[0].next();
                        if (rowEntry == null) return null;
                        return new Request(rowEntry);
                    } catch (Throwable e) {
                        return null;
                    }
                }
            }
            @Override
            public void remove() {
                rowIterator[0].remove();
            }
        };
    }

    /**
     * Get a list of domains that are currently maintained as domain stacks.
     * @param robots
     * @return a map of clear text host names to an Integer array: {the size of the domain stack, the guessed delta waiting time}
     */
    @Override
    public Map<String, Integer[]> getDomainStackHosts(RobotsTxt robots) {
        final Map<String, Integer[]> map = new TreeMap<String, Integer[]>();
        final int delta = Latency.waitingRemainingGuessed(this.hostName, this.port, this.hostHash, robots, ClientIdentification.yacyInternetCrawlerAgent);
        map.put(this.hostName, new Integer[]{this.size(), delta});
        return map;
    }
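    /* Example (illustrative sketch): a polling loop as a caller of pop() might
     * run it. "cs" and "robots" are hypothetical, already-initialized
     * CrawlSwitchboard and RobotsTxt instances; pop() may return null when an
     * entry is blacklisted or its profile has vanished, so callers must retry.
     *
     *   Request next;
     *   while ((next = queue.pop(true, cs, robots)) == null && !queue.isEmpty()) {
     *       // entry was skipped; try the next one
     *   }
     *   if (next != null) loadAndParse(next); // loadAndParse: a hypothetical consumer
     */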
    /**
     * Get a list of crawl request entries for a specific host.
     * @param host
     * @param maxcount
     * @param maxtime
     * @return a list of crawl loader requests
     */
    @Override
    public List<Request> getDomainStackReferences(String host, int maxcount, long maxtime) {
        if (host == null) return new ArrayList<Request>(0);
        if (!this.hostName.equals(host)) return new ArrayList<Request>(0);
        final ArrayList<Request> cel = new ArrayList<Request>(maxcount);
        final long timeout = maxtime == Long.MAX_VALUE ? Long.MAX_VALUE : System.currentTimeMillis() + maxtime;
        try {
            final Iterator<Request> i = this.iterator();
            while (i.hasNext()) {
                final Request r = i.next();
                if (r != null) cel.add(r);
                if (System.currentTimeMillis() > timeout || cel.size() >= maxcount) break;
            }
        } catch (IOException e) {
            ConcurrentLog.logException(e);
        }
        return cel;
    }

    @Override
    public int getOnDemandLimit() {
        throw new UnsupportedOperationException("Not supported yet.");
    }

    @Override
    public boolean getExceed134217727() {
        return this.exceed134217727;
    }
}
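/* Example (illustrative sketch): inspecting the queued requests for one host.
 * "queue" is a hypothetical HostQueue instance; maxcount and maxtime bound the
 * amount of work getDomainStackReferences() will do.
 *
 *   List<Request> pending = queue.getDomainStackReferences(queue.getHost(), 10, 1000);
 *   for (Request r: pending) System.out.println(r.url());
 */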